Importing Data and Data Cleaning

(1) Import the data set using a Tidyverse function and NOT with a Base R function. How many rows and columns are in the data set?

library(tidyverse)
library(lubridate)
library(glue)
library(knitr)
getwd()
## [1] "C:/Users/katie/OneDrive - University of Cincinnati/FS20/Second Half/Data Wrangling (BANA 7025)/Week 4"
# Import the data set using a Tidyverse function and NOT with a Base R function.  How many rows and columns are in the data set?
county_data <- readr::read_csv('homework3/acs_2015_county_data_revised.csv')
nrow(county_data)
## [1] 3142
ncol(county_data)
## [1] 35

In the data set, there are 3,142 rows and 35 columns.

(2) Do any data types need to be changed? Show any code to change variable types and show code / output for a glimpse() command after you’re finished.

str(county_data)
## tibble [3,142 x 35] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ census_id     : num [1:3142] 1001 1003 1005 1007 1009 ...
##  $ state         : chr [1:3142] "Alabama" "Alabama" "Alabama" "Alabama" ...
##  $ county        : chr [1:3142] "Autauga" "Baldwin" "Barbour" "Bibb" ...
##  $ total_pop     : num [1:3142] 55221 195121 26932 22604 57710 ...
##  $ men           : num [1:3142] 26745 95314 14497 12073 28512 ...
##  $ women         : num [1:3142] 28476 99807 12435 10531 29198 ...
##  $ hispanic      : num [1:3142] 2.6 4.5 4.6 2.2 8.6 4.4 1.2 3.5 0.4 1.5 ...
##  $ white         : num [1:3142] 75.8 83.1 46.2 74.5 87.9 22.2 53.3 73 57.3 91.7 ...
##  $ black         : num [1:3142] 18.5 9.5 46.7 21.4 1.5 70.7 43.8 20.3 40.3 4.8 ...
##  $ native        : num [1:3142] 0.4 0.6 0.2 0.4 0.3 1.2 0.1 0.2 0.2 0.6 ...
##  $ asian         : num [1:3142] 1 0.7 0.4 0.1 0.1 0.2 0.4 0.9 0.8 0.3 ...
##  $ pacific       : num [1:3142] 0 0 0 0 0 0 0 0 0 0 ...
##  $ citizen       : num [1:3142] 40725 147695 20714 17495 42345 ...
##  $ income        : num [1:3142] 51281 50254 32964 38678 45813 ...
##  $ income_per_cap: num [1:3142] 24974 27317 16824 18431 20532 ...
##  $ poverty       : num [1:3142] 12.9 13.4 26.7 16.8 16.7 24.6 25.4 20.5 21.6 19.2 ...
##  $ child_poverty : num [1:3142] 18.6 19.2 45.3 27.9 27.2 38.4 39.2 31.6 37.2 30.1 ...
##  $ professional  : num [1:3142] 33.2 33.1 26.8 21.5 28.5 18.8 27.5 27.3 23.3 29.3 ...
##  $ service       : num [1:3142] 17 17.7 16.1 17.9 14.1 15 16.6 17.7 14.5 16 ...
##  $ office        : num [1:3142] 24.2 27.1 23.1 17.8 23.9 19.7 21.9 24.2 26.3 19.5 ...
##  $ construction  : num [1:3142] 8.6 10.8 10.8 19 13.5 20.1 10.3 10.5 11.5 13.7 ...
##  $ production    : num [1:3142] 17.1 11.2 23.1 23.7 19.9 26.4 23.7 20.4 24.4 21.5 ...
##  $ drive         : num [1:3142] 87.5 84.7 83.8 83.2 84.9 74.9 84.5 85.3 85.1 83.9 ...
##  $ carpool       : num [1:3142] 8.8 8.8 10.9 13.5 11.2 14.9 12.4 9.4 11.9 12.1 ...
##  $ transit       : num [1:3142] 0.1 0.1 0.4 0.5 0.4 0.7 0 0.2 0.2 0.2 ...
##  $ walk          : num [1:3142] 0.5 1 1.8 0.6 0.9 5 0.8 1.2 0.3 0.6 ...
##  $ other_transp  : num [1:3142] 1.3 1.4 1.5 1.5 0.4 1.7 0.6 1.2 0.4 0.7 ...
##  $ work_at_home  : num [1:3142] 1.8 3.9 1.6 0.7 2.3 2.8 1.7 2.7 2.1 2.5 ...
##  $ mean_commute  : num [1:3142] 26.5 26.4 24.1 28.8 34.9 27.5 24.6 24.1 25.1 27.4 ...
##  $ employed      : num [1:3142] 23986 85953 8597 8294 22189 ...
##  $ private_work  : num [1:3142] 73.6 81.5 71.8 76.8 82 79.5 77.4 74.1 85.1 73.1 ...
##  $ public_work   : num [1:3142] 20.9 12.3 20.8 16.1 13.5 15.1 16.2 20.8 12.1 18.5 ...
##  $ self_employed : num [1:3142] 5.5 5.8 7.3 6.7 4.2 5.4 6.2 5 2.8 7.9 ...
##  $ family_work   : num [1:3142] 0 0.4 0.1 0.4 0.4 0 0.2 0.1 0 0.5 ...
##  $ unemployment  : num [1:3142] 7.6 7.5 17.6 8.3 7.7 18 10.9 12.3 8.9 7.9 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   census_id = col_double(),
##   ..   state = col_character(),
##   ..   county = col_character(),
##   ..   total_pop = col_double(),
##   ..   men = col_double(),
##   ..   women = col_double(),
##   ..   hispanic = col_double(),
##   ..   white = col_double(),
##   ..   black = col_double(),
##   ..   native = col_double(),
##   ..   asian = col_double(),
##   ..   pacific = col_double(),
##   ..   citizen = col_double(),
##   ..   income = col_double(),
##   ..   income_per_cap = col_double(),
##   ..   poverty = col_double(),
##   ..   child_poverty = col_double(),
##   ..   professional = col_double(),
##   ..   service = col_double(),
##   ..   office = col_double(),
##   ..   construction = col_double(),
##   ..   production = col_double(),
##   ..   drive = col_double(),
##   ..   carpool = col_double(),
##   ..   transit = col_double(),
##   ..   walk = col_double(),
##   ..   other_transp = col_double(),
##   ..   work_at_home = col_double(),
##   ..   mean_commute = col_double(),
##   ..   employed = col_double(),
##   ..   private_work = col_double(),
##   ..   public_work = col_double(),
##   ..   self_employed = col_double(),
##   ..   family_work = col_double(),
##   ..   unemployment = col_double()
##   .. )
head(county_data)
## # A tibble: 6 x 35
##   census_id state county total_pop   men women hispanic white black native asian
##       <dbl> <chr> <chr>      <dbl> <dbl> <dbl>    <dbl> <dbl> <dbl>  <dbl> <dbl>
## 1      1001 Alab~ Autau~     55221 26745 28476      2.6  75.8  18.5    0.4   1  
## 2      1003 Alab~ Baldw~    195121 95314 99807      4.5  83.1   9.5    0.6   0.7
## 3      1005 Alab~ Barbo~     26932 14497 12435      4.6  46.2  46.7    0.2   0.4
## 4      1007 Alab~ Bibb       22604 12073 10531      2.2  74.5  21.4    0.4   0.1
## 5      1009 Alab~ Blount     57710 28512 29198      8.6  87.9   1.5    0.3   0.1
## 6      1011 Alab~ Bullo~     10678  5660  5018      4.4  22.2  70.7    1.2   0.2
## # ... with 24 more variables: pacific <dbl>, citizen <dbl>, income <dbl>,
## #   income_per_cap <dbl>, poverty <dbl>, child_poverty <dbl>,
## #   professional <dbl>, service <dbl>, office <dbl>, construction <dbl>,
## #   production <dbl>, drive <dbl>, carpool <dbl>, transit <dbl>, walk <dbl>,
## #   other_transp <dbl>, work_at_home <dbl>, mean_commute <dbl>, employed <dbl>,
## #   private_work <dbl>, public_work <dbl>, self_employed <dbl>,
## #   family_work <dbl>, unemployment <dbl>
county_data$census_id <- as.character(county_data$census_id)
county_data$state <- as.factor(county_data$state)

It appears that no data types need to be changed except the census_id and state variables. Because the census_id variable will not need any data visualizations or summary statistics and is unique for each record, I will be changing the data type to a character. Additionally, I am going to change the state variable to a factor data type, as this bucket of information is largely categorical in nature.

Any variable that is a double (dbl) type is also numeric.

glimpse(county_data)
## Rows: 3,142
## Columns: 35
## $ census_id      <chr> "1001", "1003", "1005", "1007", "1009", "1011", "101...
## $ state          <fct> Alabama, Alabama, Alabama, Alabama, Alabama, Alabama...
## $ county         <chr> "Autauga", "Baldwin", "Barbour", "Bibb", "Blount", "...
## $ total_pop      <dbl> 55221, 195121, 26932, 22604, 57710, 10678, 20354, 11...
## $ men            <dbl> 26745, 95314, 14497, 12073, 28512, 5660, 9502, 56274...
## $ women          <dbl> 28476, 99807, 12435, 10531, 29198, 5018, 10852, 6037...
## $ hispanic       <dbl> 2.6, 4.5, 4.6, 2.2, 8.6, 4.4, 1.2, 3.5, 0.4, 1.5, 7....
## $ white          <dbl> 75.8, 83.1, 46.2, 74.5, 87.9, 22.2, 53.3, 73.0, 57.3...
## $ black          <dbl> 18.5, 9.5, 46.7, 21.4, 1.5, 70.7, 43.8, 20.3, 40.3, ...
## $ native         <dbl> 0.4, 0.6, 0.2, 0.4, 0.3, 1.2, 0.1, 0.2, 0.2, 0.6, 0....
## $ asian          <dbl> 1.0, 0.7, 0.4, 0.1, 0.1, 0.2, 0.4, 0.9, 0.8, 0.3, 0....
## $ pacific        <dbl> 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0....
## $ citizen        <dbl> 40725, 147695, 20714, 17495, 42345, 8057, 15581, 886...
## $ income         <dbl> 51281, 50254, 32964, 38678, 45813, 31938, 32229, 417...
## $ income_per_cap <dbl> 24974, 27317, 16824, 18431, 20532, 17580, 18390, 213...
## $ poverty        <dbl> 12.9, 13.4, 26.7, 16.8, 16.7, 24.6, 25.4, 20.5, 21.6...
## $ child_poverty  <dbl> 18.6, 19.2, 45.3, 27.9, 27.2, 38.4, 39.2, 31.6, 37.2...
## $ professional   <dbl> 33.2, 33.1, 26.8, 21.5, 28.5, 18.8, 27.5, 27.3, 23.3...
## $ service        <dbl> 17.0, 17.7, 16.1, 17.9, 14.1, 15.0, 16.6, 17.7, 14.5...
## $ office         <dbl> 24.2, 27.1, 23.1, 17.8, 23.9, 19.7, 21.9, 24.2, 26.3...
## $ construction   <dbl> 8.6, 10.8, 10.8, 19.0, 13.5, 20.1, 10.3, 10.5, 11.5,...
## $ production     <dbl> 17.1, 11.2, 23.1, 23.7, 19.9, 26.4, 23.7, 20.4, 24.4...
## $ drive          <dbl> 87.5, 84.7, 83.8, 83.2, 84.9, 74.9, 84.5, 85.3, 85.1...
## $ carpool        <dbl> 8.8, 8.8, 10.9, 13.5, 11.2, 14.9, 12.4, 9.4, 11.9, 1...
## $ transit        <dbl> 0.1, 0.1, 0.4, 0.5, 0.4, 0.7, 0.0, 0.2, 0.2, 0.2, 0....
## $ walk           <dbl> 0.5, 1.0, 1.8, 0.6, 0.9, 5.0, 0.8, 1.2, 0.3, 0.6, 1....
## $ other_transp   <dbl> 1.3, 1.4, 1.5, 1.5, 0.4, 1.7, 0.6, 1.2, 0.4, 0.7, 1....
## $ work_at_home   <dbl> 1.8, 3.9, 1.6, 0.7, 2.3, 2.8, 1.7, 2.7, 2.1, 2.5, 1....
## $ mean_commute   <dbl> 26.5, 26.4, 24.1, 28.8, 34.9, 27.5, 24.6, 24.1, 25.1...
## $ employed       <dbl> 23986, 85953, 8597, 8294, 22189, 3865, 7813, 47401, ...
## $ private_work   <dbl> 73.6, 81.5, 71.8, 76.8, 82.0, 79.5, 77.4, 74.1, 85.1...
## $ public_work    <dbl> 20.9, 12.3, 20.8, 16.1, 13.5, 15.1, 16.2, 20.8, 12.1...
## $ self_employed  <dbl> 5.5, 5.8, 7.3, 6.7, 4.2, 5.4, 6.2, 5.0, 2.8, 7.9, 4....
## $ family_work    <dbl> 0.0, 0.4, 0.1, 0.4, 0.4, 0.0, 0.2, 0.1, 0.0, 0.5, 0....
## $ unemployment   <dbl> 7.6, 7.5, 17.6, 8.3, 7.7, 18.0, 10.9, 12.3, 8.9, 7.9...

(3) Are there any missing values? How will you handle missing values? Will you impute a missing value with, for example, a mean or median value for the entire column, or will you remove the entire observation? Give a rationale for your decision and show any code/output to handle missing values.

colSums(is.na(county_data))
##      census_id          state         county      total_pop            men 
##              0              0              0              0              0 
##          women       hispanic          white          black         native 
##              0              0              0              0              0 
##          asian        pacific        citizen         income income_per_cap 
##              0              0              0              1              0 
##        poverty  child_poverty   professional        service         office 
##              0              1              0              0              0 
##   construction     production          drive        carpool        transit 
##              0              0              0              0              0 
##           walk   other_transp   work_at_home   mean_commute       employed 
##              0              0              0              0              0 
##   private_work    public_work  self_employed    family_work   unemployment 
##              0              0              0              0              0

There are only missing values in the income and child_poverty columns, for a total of two missing values. Because these are only two values out of over 3000, I will simply drop these observations entirely. It would not make sense to impute these values with a mean or median, as the two missing values will not greatly impact the data set.

county_data <- drop_na(county_data)

## Check to see if there are any missing values
colSums(is.na(county_data))
##      census_id          state         county      total_pop            men 
##              0              0              0              0              0 
##          women       hispanic          white          black         native 
##              0              0              0              0              0 
##          asian        pacific        citizen         income income_per_cap 
##              0              0              0              0              0 
##        poverty  child_poverty   professional        service         office 
##              0              0              0              0              0 
##   construction     production          drive        carpool        transit 
##              0              0              0              0              0 
##           walk   other_transp   work_at_home   mean_commute       employed 
##              0              0              0              0              0 
##   private_work    public_work  self_employed    family_work   unemployment 
##              0              0              0              0              0
nrow(county_data)
## [1] 3140
ncol(county_data)
## [1] 35

As evident in the code chunk above, there are now 3,140 rows and 35 columns.

(4) Use the summary() function to examine any unusual values. Are there any? If so, how will you handle these unusual values? Show any code / output to handle unusual values.

summary(county_data)
##   census_id              state         county            total_pop       
##  Length:3140        Texas   : 253   Length:3140        Min.   :     267  
##  Class :character   Georgia : 159   Class :character   1st Qu.:   11036  
##  Mode  :character   Virginia: 133   Mode  :character   Median :   25793  
##                     Kentucky: 120                      Mean   :  100801  
##                     Missouri: 115                      3rd Qu.:   67620  
##                     Kansas  : 105                      Max.   :10038388  
##                     (Other) :2255                                        
##       men              women            hispanic          white      
##  Min.   :    136   Min.   :    131   Min.   : 0.000   Min.   : 0.90  
##  1st Qu.:   5551   1st Qu.:   5488   1st Qu.: 1.900   1st Qu.:65.67  
##  Median :  12838   Median :  12916   Median : 3.700   Median :84.65  
##  Mean   :  49597   Mean   :  51204   Mean   : 8.819   Mean   :77.31  
##  3rd Qu.:  33328   3rd Qu.:  34123   3rd Qu.: 9.000   3rd Qu.:93.33  
##  Max.   :4945351   Max.   :5093037   Max.   :98.700   Max.   :99.80  
##                                                                      
##      black            native           asian           pacific        
##  Min.   : 0.000   Min.   : 0.000   Min.   : 0.000   Min.   : 0.00000  
##  1st Qu.: 0.600   1st Qu.: 0.100   1st Qu.: 0.200   1st Qu.: 0.00000  
##  Median : 2.100   Median : 0.300   Median : 0.500   Median : 0.00000  
##  Mean   : 8.885   Mean   : 1.763   Mean   : 1.253   Mean   : 0.07357  
##  3rd Qu.:10.200   3rd Qu.: 0.600   3rd Qu.: 1.200   3rd Qu.: 0.00000  
##  Max.   :85.900   Max.   :92.100   Max.   :41.600   Max.   :11.10000  
##                                                                       
##     citizen            income       income_per_cap     poverty    
##  Min.   :    199   Min.   : 19328   Min.   : 8292   Min.   : 1.4  
##  1st Qu.:   8276   1st Qu.: 38826   1st Qu.:20470   1st Qu.:12.0  
##  Median :  19454   Median : 45095   Median :23575   Median :16.0  
##  Mean   :  70849   Mean   : 46824   Mean   :24331   Mean   :16.7  
##  3rd Qu.:  50795   3rd Qu.: 52248   3rd Qu.:27138   3rd Qu.:20.3  
##  Max.   :6046749   Max.   :123453   Max.   :65600   Max.   :53.3  
##                                                                   
##  child_poverty    professional      service          office     
##  Min.   : 0.00   Min.   :13.50   Min.   : 5.00   Min.   : 4.10  
##  1st Qu.:16.10   1st Qu.:26.70   1st Qu.:15.90   1st Qu.:20.20  
##  Median :22.50   Median :30.00   Median :18.00   Median :22.40  
##  Mean   :23.29   Mean   :31.05   Mean   :18.25   Mean   :22.13  
##  3rd Qu.:29.50   3rd Qu.:34.42   3rd Qu.:20.20   3rd Qu.:24.30  
##  Max.   :72.30   Max.   :74.00   Max.   :36.60   Max.   :35.40  
##                                                                 
##   construction     production        drive         carpool     
##  Min.   : 1.70   Min.   : 0.00   Min.   : 5.2   Min.   : 0.00  
##  1st Qu.: 9.80   1st Qu.:11.50   1st Qu.:76.6   1st Qu.: 8.50  
##  Median :12.20   Median :15.40   Median :80.6   Median : 9.90  
##  Mean   :12.75   Mean   :15.82   Mean   :79.1   Mean   :10.33  
##  3rd Qu.:15.00   3rd Qu.:19.40   3rd Qu.:83.6   3rd Qu.:11.90  
##  Max.   :40.30   Max.   :55.60   Max.   :94.6   Max.   :29.90  
##                                                                
##     transit             walk         other_transp    work_at_home   
##  Min.   : 0.0000   Min.   : 0.000   Min.   : 0.00   Min.   : 0.000  
##  1st Qu.: 0.1000   1st Qu.: 1.400   1st Qu.: 0.90   1st Qu.: 2.800  
##  Median : 0.4000   Median : 2.400   Median : 1.30   Median : 4.000  
##  Mean   : 0.9681   Mean   : 3.294   Mean   : 1.61   Mean   : 4.694  
##  3rd Qu.: 0.8000   3rd Qu.: 4.000   3rd Qu.: 1.90   3rd Qu.: 5.700  
##  Max.   :61.7000   Max.   :71.200   Max.   :39.10   Max.   :37.200  
##                                                                     
##   mean_commute      employed        private_work    public_work   
##  Min.   : 4.90   Min.   :    166   Min.   :29.50   Min.   : 5.80  
##  1st Qu.:19.30   1st Qu.:   4532   1st Qu.:70.90   1st Qu.:13.07  
##  Median :22.90   Median :  10657   Median :75.85   Median :16.10  
##  Mean   :23.15   Mean   :  46416   Mean   :74.45   Mean   :17.33  
##  3rd Qu.:26.60   3rd Qu.:  29272   3rd Qu.:79.80   3rd Qu.:20.10  
##  Max.   :44.00   Max.   :4635465   Max.   :88.30   Max.   :66.20  
##                                                                   
##  self_employed     family_work      unemployment   
##  Min.   : 0.000   Min.   :0.0000   Min.   : 0.000  
##  1st Qu.: 5.400   1st Qu.:0.1000   1st Qu.: 5.500  
##  Median : 6.900   Median :0.2000   Median : 7.500  
##  Mean   : 7.922   Mean   :0.2917   Mean   : 7.815  
##  3rd Qu.: 9.400   3rd Qu.:0.3000   3rd Qu.: 9.700  
##  Max.   :36.600   Max.   :9.8000   Max.   :29.400  
## 

Looking at the above summary, there are a few variables that have unusual values. When compared to the data dictionary, the employed variable appears to be incorrectly recorded. This is because the data dictionary defines the variable as, “Percentage employed, ages 16+”. However, the Min to Max ranges from 166 to 4,635,465, which is not between 0 and 100 (as a typical percentage would be). During my analysis, I will ignore this variable and / or drop it.

Additionally, more unusual values appear in the form of outliers. Outliers appear to exist in almost every variable, as seen in examples total_pop, men, women, citizen, etc. These outliers could be proven through plots and visualizations in further analysis.

Data Manipulation and Insights

(5) How many counties have more women than men?

county_data %>% 
  filter(women > men) %>% 
  nrow()
## [1] 1984

There are 1,984 counties that have more women than men.

(6) How many counties have an unemployment rate lower than 10%?

county_data %>% 
  filter(unemployment < 10) %>% 
           nrow()
## [1] 2419

There are 2,419 counties with an unemployment rate lower than 10%.

(7) What are the top 10 counties with the highest mean commute? Show the census ID, county name, state, and the mean_commute in your final answer (sorted by mean_commute).

kable(county_data %>% 
  select(census_id, county, state, mean_commute) %>%
  arrange(desc(mean_commute)) %>% 
  top_n(10))
census_id county state mean_commute
42103 Pike Pennsylvania 44.0
36005 Bronx New York 43.0
24017 Charles Maryland 42.8
51187 Warren Virginia 42.7
36081 Queens New York 42.6
36085 Richmond New York 42.6
51193 Westmoreland Virginia 42.5
8093 Park Colorado 42.4
36047 Kings New York 41.7
54015 Clay West Virginia 41.4

The top ten counties with the highest mean commute are seen in the table above.

(8) Create a new variable that calculates the percentage of women for each county and then find the top 10 counties with the lowest percentages. Show the census ID, county name, state,and the percentage in your final answer (sorted by ascending percentage).

kable(county_data %>% 
  mutate(percentage_of_women = women/total_pop) %>% 
  select(census_id, county, state, percentage_of_women) %>% 
  arrange(percentage_of_women) %>% 
  top_n(-10))
census_id county state percentage_of_women
42053 Forest Pennsylvania 0.2677747
8011 Bent Colorado 0.3136556
51183 Sussex Virginia 0.3147336
13309 Wheeler Georgia 0.3210156
6035 Lassen California 0.3316588
48095 Concho Texas 0.3328439
13053 Chattahoochee Georgia 0.3335572
2013 Aleutians East Borough Alaska 0.3347458
22125 West Feliciana Louisiana 0.3364904
32027 Pershing Nevada 0.3372508

The top ten counties with the lowest percentage of women are seen in the table above.

(9) Create a new variable that calculates the sum of all race percentage variables (these columns are the “hispanic”, “white”, “black”, “native”, “asian”, and “pacific” variables).

# The new variable is named "percentage_race_total"
county_data <- mutate(county_data, percentage_race_total = hispanic + white + black + native + asian + pacific)

kable(county_data %>% 
  select(percentage_race_total, everything()) %>% 
  arrange(percentage_race_total) %>% 
  top_n(10))
percentage_race_total census_id state county total_pop men women hispanic white black native asian pacific citizen income income_per_cap poverty child_poverty professional service office construction production drive carpool transit walk other_transp work_at_home mean_commute employed private_work public_work self_employed family_work unemployment
89.0 46121 South Dakota Todd 9942 4862 5080 3.7 10.0 0.3 74.4 0.6 0.0 5915 31128 11616 46.4 56.0 35.5 25.0 22.0 10.5 7.1 58.3 18.0 1.0 6.0 5.0 11.7 16.5 2901 34.4 55.0 9.8 0.8 25.4
93.8 2158 Alaska Kusilvak Census Area 7914 4200 3714 1.0 4.5 0.4 87.4 0.4 0.1 4680 38229 11569 33.3 39.7 29.9 22.8 22.4 14.6 10.3 5.2 3.3 0.1 47.6 39.1 4.7 5.9 2067 45.4 53.8 0.5 0.3 28.6
97.6 1035 Alabama Conecuh 12865 6176 6689 1.6 51.0 44.7 0.3 0.0 0.0 9930 24900 15968 33.8 41.5 19.4 21.0 20.4 11.4 27.9 93.6 4.3 0.0 0.6 0.3 1.3 29.7 3718 77.8 13.0 8.3 0.8 22.6
98.2 46041 South Dakota Dewey 5579 2718 2861 0.9 21.7 0.0 75.6 0.0 0.0 3644 37206 16216 27.8 33.7 42.4 20.1 17.4 13.7 6.5 73.1 3.1 1.9 5.2 1.6 15.2 16.3 1900 34.9 49.2 14.7 1.2 24.6
98.4 46102 South Dakota Oglala Lakota 14153 6809 7344 1.0 4.8 0.5 92.1 0.0 0.0 8903 26369 9150 53.3 61.5 37.2 26.0 19.7 7.2 10.0 64.1 18.5 0.5 8.7 2.3 5.8 17.7 3226 29.5 66.2 4.3 0.0 28.7
98.4 8025 Colorado Crowley 5551 3145 2406 30.2 61.1 5.3 0.9 0.4 0.5 4169 31151 10742 33.4 40.9 24.4 28.8 22.7 11.5 12.6 79.5 10.4 0.0 7.3 0.5 2.3 21.9 1457 66.7 27.3 6.0 0.0 27.0
98.9 46031 South Dakota Corson 4149 2123 2026 0.6 30.9 0.3 66.8 0.3 0.0 2706 31676 13848 45.6 56.9 43.0 14.9 18.7 15.8 7.6 63.0 5.9 1.5 8.9 1.5 19.0 19.7 1240 36.1 40.8 21.5 1.5 29.4
99.0 46137 South Dakota Ziebach 2833 1391 1442 3.7 23.7 0.3 70.6 0.7 0.0 1796 35119 12877 39.5 47.2 48.7 12.9 17.5 15.8 5.1 65.3 5.7 0.7 6.6 1.7 20.0 19.6 948 40.9 41.7 16.6 0.8 27.4
99.4 28119 Mississippi Quitman 7761 3751 4010 0.9 28.3 70.2 0.0 0.0 0.0 5751 24583 14503 39.5 60.8 30.1 27.0 17.0 11.9 14.1 81.2 13.0 0.2 2.9 0.1 2.5 23.7 2324 68.1 24.4 7.5 0.0 25.3
99.8 45005 South Carolina Allendale 9838 5225 4613 2.8 23.2 73.8 0.0 0.0 0.0 7758 25327 12199 29.0 48.1 17.2 26.0 22.6 9.0 25.3 81.4 11.3 0.0 3.4 0.5 3.4 27.5 2866 73.3 21.9 4.8 0.0 22.6
99.9 28053 Mississippi Humphreys 8984 4278 4706 2.6 22.0 75.1 0.1 0.1 0.0 6469 23216 13503 39.7 49.1 24.7 17.6 17.3 13.6 26.9 77.5 17.8 0.0 2.6 0.7 1.5 21.4 2678 74.3 18.9 6.6 0.2 26.8

The top ten counties with the lowest sum of these race percentage variables are seen above.

kable(county_data %>% 
  select(percentage_race_total, everything()) %>% 
  group_by(state) %>% 
  summarise(avg = mean(percentage_race_total)) %>% 
  arrange(avg) %>% 
  top_n(1))
state avg
Mississippi 99.18415

On average, Mississippi has the lowest sum of these race percentage variables as seen above.

county_data %>% 
  filter(percentage_race_total > 100) %>% 
  nrow()
## [1] 11

There are 11 counties that have a sum greater than 100%.

county_data %>% 
  filter(percentage_race_total == 100) %>% 
  nrow()
## [1] 27

There are 27 counties that have a sum equal to 100%.

(10) Using the carpool variable,

county_data$carpool_rank = min_rank(-(county_data$carpool))

kable(county_data %>% 
  select(census_id, county, state, carpool, carpool_rank) %>% 
  arrange(carpool_rank) %>% 
  top_n(-10, wt=carpool_rank))
census_id county state carpool carpool_rank
13061 Clay Georgia 29.9 1
18087 LaGrange Indiana 27.0 2
13165 Jenkins Georgia 25.3 3
5133 Sevier Arkansas 24.4 4
20175 Seward Kansas 23.4 5
48079 Cochran Texas 22.8 6
48247 Jim Hogg Texas 22.6 7
48393 Roberts Texas 22.4 8
39075 Holmes Ohio 21.8 9
21197 Powell Kentucky 21.6 10

The ten highest ranked counties for carpooling can be seen above.

kable(county_data %>% 
  select(census_id, county, state, carpool, carpool_rank) %>% 
  arrange(carpool_rank) %>% 
  top_n(10, wt=carpool_rank))
census_id county state carpool carpool_rank
46069 Hyde South Dakota 2.8 3130
51720 Norton city Virginia 2.8 3130
30019 Daniels Montana 2.6 3132
31057 Dundy Nebraska 2.6 3132
13309 Wheeler Georgia 2.3 3134
38029 Emmons North Dakota 2.3 3134
36061 New York New York 1.9 3136
31183 Wheeler Nebraska 1.3 3137
48235 Irion Texas 0.9 3138
48261 Kenedy Texas 0.0 3139
48269 King Texas 0.0 3139

The ten lowest ranked counties for carpooling can be seen above.

kable(county_data %>% 
  group_by(state) %>% 
  summarise(average_rank = mean(carpool_rank)) %>% 
  arrange(average_rank) %>% 
  top_n(-1))
state average_rank
Hawaii 651.25

On average, the state best ranked for carpooling is Hawaii.

kable(county_data %>% 
  group_by(state) %>% 
  summarise(average_rank = mean(carpool_rank)) %>% 
  arrange(average_rank) %>% 
  top_n(-5))
state average_rank
Hawaii 651.250
Arizona 970.400
Utah 1018.931
Arkansas 1054.307
Alaska 1086.379

On average, the best top five states for carpooling can be seen above.