Homework 3

library(tidyverse)
library(lubridate)
library(glue)

Importing Data and Data Cleaning

Import the data set using a Tidyverse function and NOT with a Base R function. How many rows and columns are in the data set?

setwd("C:/Users/Adam Deuber/OneDrive/UC/BANA Masters/Data Wrangling/Data Wrangling/Week 4/Week 4")
county_data <- readr::read_csv("acs_2015_county_data_revised.csv")
head(county_data)

## # A tibble: 6 x 35
##   census_id state county total_pop   men women hispanic white black native asian
##       <dbl> <chr> <chr>      <dbl> <dbl> <dbl>    <dbl> <dbl> <dbl>  <dbl> <dbl>
## 1      1001 Alab~ Autau~     55221 26745 28476      2.6  75.8  18.5    0.4   1  
## 2      1003 Alab~ Baldw~    195121 95314 99807      4.5  83.1   9.5    0.6   0.7
## 3      1005 Alab~ Barbo~     26932 14497 12435      4.6  46.2  46.7    0.2   0.4
## 4      1007 Alab~ Bibb       22604 12073 10531      2.2  74.5  21.4    0.4   0.1
## 5      1009 Alab~ Blount     57710 28512 29198      8.6  87.9   1.5    0.3   0.1
## 6      1011 Alab~ Bullo~     10678  5660  5018      4.4  22.2  70.7    1.2   0.2
## # ... with 24 more variables: pacific <dbl>, citizen <dbl>, income <dbl>,
## #   income_per_cap <dbl>, poverty <dbl>, child_poverty <dbl>,
## #   professional <dbl>, service <dbl>, office <dbl>, construction <dbl>,
## #   production <dbl>, drive <dbl>, carpool <dbl>, transit <dbl>, walk <dbl>,
## #   other_transp <dbl>, work_at_home <dbl>, mean_commute <dbl>, employed <dbl>,
## #   private_work <dbl>, public_work <dbl>, self_employed <dbl>,
## #   family_work <dbl>, unemployment <dbl>

nrow(county_data)

## [1] 3142

ncol(county_data)

## [1] 35

There are 3142 rows and 35 columns.

Do any data types need changed? Show any code to change variable types and showcode/output for a command after you’re finished.

glimpse(county_data)

## Observations: 3,142
## Variables: 35
## $ census_id      <dbl> 1001, 1003, 1005, 1007, 1009, 1011, 1013, 1015, 1017...
## $ state          <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama...
## $ county         <chr> "Autauga", "Baldwin", "Barbour", "Bibb", "Blount", "...
## $ total_pop      <dbl> 55221, 195121, 26932, 22604, 57710, 10678, 20354, 11...
## $ men            <dbl> 26745, 95314, 14497, 12073, 28512, 5660, 9502, 56274...
## $ women          <dbl> 28476, 99807, 12435, 10531, 29198, 5018, 10852, 6037...
## $ hispanic       <dbl> 2.6, 4.5, 4.6, 2.2, 8.6, 4.4, 1.2, 3.5, 0.4, 1.5, 7....
## $ white          <dbl> 75.8, 83.1, 46.2, 74.5, 87.9, 22.2, 53.3, 73.0, 57.3...
## $ black          <dbl> 18.5, 9.5, 46.7, 21.4, 1.5, 70.7, 43.8, 20.3, 40.3, ...
## $ native         <dbl> 0.4, 0.6, 0.2, 0.4, 0.3, 1.2, 0.1, 0.2, 0.2, 0.6, 0....
## $ asian          <dbl> 1.0, 0.7, 0.4, 0.1, 0.1, 0.2, 0.4, 0.9, 0.8, 0.3, 0....
## $ pacific        <dbl> 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0....
## $ citizen        <dbl> 40725, 147695, 20714, 17495, 42345, 8057, 15581, 886...
## $ income         <dbl> 51281, 50254, 32964, 38678, 45813, 31938, 32229, 417...
## $ income_per_cap <dbl> 24974, 27317, 16824, 18431, 20532, 17580, 18390, 213...
## $ poverty        <dbl> 12.9, 13.4, 26.7, 16.8, 16.7, 24.6, 25.4, 20.5, 21.6...
## $ child_poverty  <dbl> 18.6, 19.2, 45.3, 27.9, 27.2, 38.4, 39.2, 31.6, 37.2...
## $ professional   <dbl> 33.2, 33.1, 26.8, 21.5, 28.5, 18.8, 27.5, 27.3, 23.3...
## $ service        <dbl> 17.0, 17.7, 16.1, 17.9, 14.1, 15.0, 16.6, 17.7, 14.5...
## $ office         <dbl> 24.2, 27.1, 23.1, 17.8, 23.9, 19.7, 21.9, 24.2, 26.3...
## $ construction   <dbl> 8.6, 10.8, 10.8, 19.0, 13.5, 20.1, 10.3, 10.5, 11.5,...
## $ production     <dbl> 17.1, 11.2, 23.1, 23.7, 19.9, 26.4, 23.7, 20.4, 24.4...
## $ drive          <dbl> 87.5, 84.7, 83.8, 83.2, 84.9, 74.9, 84.5, 85.3, 85.1...
## $ carpool        <dbl> 8.8, 8.8, 10.9, 13.5, 11.2, 14.9, 12.4, 9.4, 11.9, 1...
## $ transit        <dbl> 0.1, 0.1, 0.4, 0.5, 0.4, 0.7, 0.0, 0.2, 0.2, 0.2, 0....
## $ walk           <dbl> 0.5, 1.0, 1.8, 0.6, 0.9, 5.0, 0.8, 1.2, 0.3, 0.6, 1....
## $ other_transp   <dbl> 1.3, 1.4, 1.5, 1.5, 0.4, 1.7, 0.6, 1.2, 0.4, 0.7, 1....
## $ work_at_home   <dbl> 1.8, 3.9, 1.6, 0.7, 2.3, 2.8, 1.7, 2.7, 2.1, 2.5, 1....
## $ mean_commute   <dbl> 26.5, 26.4, 24.1, 28.8, 34.9, 27.5, 24.6, 24.1, 25.1...
## $ employed       <dbl> 23986, 85953, 8597, 8294, 22189, 3865, 7813, 47401, ...
## $ private_work   <dbl> 73.6, 81.5, 71.8, 76.8, 82.0, 79.5, 77.4, 74.1, 85.1...
## $ public_work    <dbl> 20.9, 12.3, 20.8, 16.1, 13.5, 15.1, 16.2, 20.8, 12.1...
## $ self_employed  <dbl> 5.5, 5.8, 7.3, 6.7, 4.2, 5.4, 6.2, 5.0, 2.8, 7.9, 4....
## $ family_work    <dbl> 0.0, 0.4, 0.1, 0.4, 0.4, 0.0, 0.2, 0.1, 0.0, 0.5, 0....
## $ unemployment   <dbl> 7.6, 7.5, 17.6, 8.3, 7.7, 18.0, 10.9, 12.3, 8.9, 7.9...

county_data$census_id <- as.character(county_data$census_id)
county_data$state <- as.factor(county_data$state)

A census ID does not require statistical breakdowns. As such, I switched it to a character to allow it to be a label for all of the observations.

Additionally, I changed state to be a factor as it is a categorical variable and the observations can be collapsed into individual states.

Are there any missing values? How will you handle missing values? Will you impute a missing value with, for example, a mean or median value for the entire column, or will you remove the entire observation? Give a rationale for your decision and show any code/output to handle missing values.

colSums(is.na(county_data))

##      census_id          state         county      total_pop            men 
##              0              0              0              0              0 
##          women       hispanic          white          black         native 
##              0              0              0              0              0 
##          asian        pacific        citizen         income income_per_cap 
##              0              0              0              1              0 
##        poverty  child_poverty   professional        service         office 
##              0              1              0              0              0 
##   construction     production          drive        carpool        transit 
##              0              0              0              0              0 
##           walk   other_transp   work_at_home   mean_commute       employed 
##              0              0              0              0              0 
##   private_work    public_work  self_employed    family_work   unemployment 
##              0              0              0              0              0

county_data <- drop_na(county_data)
colSums(is.na(county_data))

##      census_id          state         county      total_pop            men 
##              0              0              0              0              0 
##          women       hispanic          white          black         native 
##              0              0              0              0              0 
##          asian        pacific        citizen         income income_per_cap 
##              0              0              0              0              0 
##        poverty  child_poverty   professional        service         office 
##              0              0              0              0              0 
##   construction     production          drive        carpool        transit 
##              0              0              0              0              0 
##           walk   other_transp   work_at_home   mean_commute       employed 
##              0              0              0              0              0 
##   private_work    public_work  self_employed    family_work   unemployment 
##              0              0              0              0              0

There were only two observations with NA values. One was in income, and the other was in child_poverty. With just two rows in a dataset of over 3000, I decided to remove these from my data as seen in the code.

Use the function to examine any unusual values. Are there any? If so, how will you handle these unusual values? Show any code/output to handle unusual values.

summary(county_data)

##   census_id              state         county            total_pop       
##  Length:3140        Texas   : 253   Length:3140        Min.   :     267  
##  Class :character   Georgia : 159   Class :character   1st Qu.:   11036  
##  Mode  :character   Virginia: 133   Mode  :character   Median :   25793  
##                     Kentucky: 120                      Mean   :  100801  
##                     Missouri: 115                      3rd Qu.:   67620  
##                     Kansas  : 105                      Max.   :10038388  
##                     (Other) :2255                                        
##       men              women            hispanic          white      
##  Min.   :    136   Min.   :    131   Min.   : 0.000   Min.   : 0.90  
##  1st Qu.:   5551   1st Qu.:   5488   1st Qu.: 1.900   1st Qu.:65.67  
##  Median :  12838   Median :  12916   Median : 3.700   Median :84.65  
##  Mean   :  49597   Mean   :  51204   Mean   : 8.819   Mean   :77.31  
##  3rd Qu.:  33328   3rd Qu.:  34123   3rd Qu.: 9.000   3rd Qu.:93.33  
##  Max.   :4945351   Max.   :5093037   Max.   :98.700   Max.   :99.80  
##                                                                      
##      black            native           asian           pacific        
##  Min.   : 0.000   Min.   : 0.000   Min.   : 0.000   Min.   : 0.00000  
##  1st Qu.: 0.600   1st Qu.: 0.100   1st Qu.: 0.200   1st Qu.: 0.00000  
##  Median : 2.100   Median : 0.300   Median : 0.500   Median : 0.00000  
##  Mean   : 8.885   Mean   : 1.763   Mean   : 1.253   Mean   : 0.07357  
##  3rd Qu.:10.200   3rd Qu.: 0.600   3rd Qu.: 1.200   3rd Qu.: 0.00000  
##  Max.   :85.900   Max.   :92.100   Max.   :41.600   Max.   :11.10000  
##                                                                       
##     citizen            income       income_per_cap     poverty    
##  Min.   :    199   Min.   : 19328   Min.   : 8292   Min.   : 1.4  
##  1st Qu.:   8276   1st Qu.: 38826   1st Qu.:20470   1st Qu.:12.0  
##  Median :  19454   Median : 45095   Median :23575   Median :16.0  
##  Mean   :  70849   Mean   : 46824   Mean   :24331   Mean   :16.7  
##  3rd Qu.:  50795   3rd Qu.: 52248   3rd Qu.:27138   3rd Qu.:20.3  
##  Max.   :6046749   Max.   :123453   Max.   :65600   Max.   :53.3  
##                                                                   
##  child_poverty    professional      service          office     
##  Min.   : 0.00   Min.   :13.50   Min.   : 5.00   Min.   : 4.10  
##  1st Qu.:16.10   1st Qu.:26.70   1st Qu.:15.90   1st Qu.:20.20  
##  Median :22.50   Median :30.00   Median :18.00   Median :22.40  
##  Mean   :23.29   Mean   :31.05   Mean   :18.25   Mean   :22.13  
##  3rd Qu.:29.50   3rd Qu.:34.42   3rd Qu.:20.20   3rd Qu.:24.30  
##  Max.   :72.30   Max.   :74.00   Max.   :36.60   Max.   :35.40  
##                                                                 
##   construction     production        drive         carpool     
##  Min.   : 1.70   Min.   : 0.00   Min.   : 5.2   Min.   : 0.00  
##  1st Qu.: 9.80   1st Qu.:11.50   1st Qu.:76.6   1st Qu.: 8.50  
##  Median :12.20   Median :15.40   Median :80.6   Median : 9.90  
##  Mean   :12.75   Mean   :15.82   Mean   :79.1   Mean   :10.33  
##  3rd Qu.:15.00   3rd Qu.:19.40   3rd Qu.:83.6   3rd Qu.:11.90  
##  Max.   :40.30   Max.   :55.60   Max.   :94.6   Max.   :29.90  
##                                                                
##     transit             walk         other_transp    work_at_home   
##  Min.   : 0.0000   Min.   : 0.000   Min.   : 0.00   Min.   : 0.000  
##  1st Qu.: 0.1000   1st Qu.: 1.400   1st Qu.: 0.90   1st Qu.: 2.800  
##  Median : 0.4000   Median : 2.400   Median : 1.30   Median : 4.000  
##  Mean   : 0.9681   Mean   : 3.294   Mean   : 1.61   Mean   : 4.694  
##  3rd Qu.: 0.8000   3rd Qu.: 4.000   3rd Qu.: 1.90   3rd Qu.: 5.700  
##  Max.   :61.7000   Max.   :71.200   Max.   :39.10   Max.   :37.200  
##                                                                     
##   mean_commute      employed        private_work    public_work   
##  Min.   : 4.90   Min.   :    166   Min.   :29.50   Min.   : 5.80  
##  1st Qu.:19.30   1st Qu.:   4532   1st Qu.:70.90   1st Qu.:13.07  
##  Median :22.90   Median :  10657   Median :75.85   Median :16.10  
##  Mean   :23.15   Mean   :  46416   Mean   :74.45   Mean   :17.33  
##  3rd Qu.:26.60   3rd Qu.:  29272   3rd Qu.:79.80   3rd Qu.:20.10  
##  Max.   :44.00   Max.   :4635465   Max.   :88.30   Max.   :66.20  
##                                                                   
##  self_employed     family_work      unemployment   
##  Min.   : 0.000   Min.   :0.0000   Min.   : 0.000  
##  1st Qu.: 5.400   1st Qu.:0.1000   1st Qu.: 5.500  
##  Median : 6.900   Median :0.2000   Median : 7.500  
##  Mean   : 7.922   Mean   :0.2917   Mean   : 7.815  
##  3rd Qu.: 9.400   3rd Qu.:0.3000   3rd Qu.: 9.700  
##  Max.   :36.600   Max.   :9.8000   Max.   :29.400  
##

The employed variable is listed as a percentage in the data dictionary. However, the minimum value is 166 and maximum is 4,635,465. This is not between 0 to 100 and is reflective as a percentage.

Additionally, outliers look likely in several columns. Further analysis would showcase these findings.

Data Manipulation and Insights

How many counties have more women than men?

county_data %>% 
  filter(women > men) %>% 
  nrow()

## [1] 1984

There are 1984 counties with more women than men.

How many counties have an unemployment rate lower than 10%?

county_data %>% 
  filter(unemployment < 10.0) %>% 
  nrow()

## [1] 2419

There are 2419 counties with an unemployment rate under than 10%.

What are the top 10 counties with the highest mean commute? Show the census ID, countyname, state, and the mean_commute in your final answer (sorted by mean_commute).

county_data %>%
  select(census_id, county, state, mean_commute) %>% 
  arrange(desc(mean_commute)) %>% 
  top_n(10)

## # A tibble: 10 x 4
##    census_id county       state         mean_commute
##    <chr>     <chr>        <fct>                <dbl>
##  1 42103     Pike         Pennsylvania          44  
##  2 36005     Bronx        New York              43  
##  3 24017     Charles      Maryland              42.8
##  4 51187     Warren       Virginia              42.7
##  5 36081     Queens       New York              42.6
##  6 36085     Richmond     New York              42.6
##  7 51193     Westmoreland Virginia              42.5
##  8 8093      Park         Colorado              42.4
##  9 36047     Kings        New York              41.7
## 10 54015     Clay         West Virginia         41.4

Please see the output for the table with the top 10 counties with the highest mean commute.

Create a new variable that calculates the percentage of women for each county and then find the top 10 counties with the lowest percentages. Show the census ID, county name, state,and the percentage in your final answer (sorted by ascending percentage).

county_data %>% 
  mutate(women_pct = women/total_pop) %>% 
  select(census_id, county, state, women_pct) %>% 
  arrange(women_pct) %>% 
  top_n(-10)

## # A tibble: 10 x 4
##    census_id county                 state        women_pct
##    <chr>     <chr>                  <fct>            <dbl>
##  1 42053     Forest                 Pennsylvania     0.268
##  2 8011      Bent                   Colorado         0.314
##  3 51183     Sussex                 Virginia         0.315
##  4 13309     Wheeler                Georgia          0.321
##  5 6035      Lassen                 California       0.332
##  6 48095     Concho                 Texas            0.333
##  7 13053     Chattahoochee          Georgia          0.334
##  8 2013      Aleutians East Borough Alaska           0.335
##  9 22125     West Feliciana         Louisiana        0.336
## 10 32027     Pershing               Nevada           0.337

Please see the output for the table with the top 10 counties with the lowest women percentage.

Create a new variable that calculates the sum of all race percentage variables (these columns are the “hispanic”, “white”, “black”, “native”, “asian”, and “pacific” variables).

1. What are the top 10 counties with the lowest sum of these race percentage variables?

county_data <- mutate(county_data, race_tot = hispanic + white + black + native + asian + pacific)

county_data %>% 
  select(race_tot, everything()) %>% 
  arrange(race_tot) %>% 
  top_n(10)

## # A tibble: 11 x 36
##    race_tot census_id state county total_pop   men women hispanic white black
##       <dbl> <chr>     <fct> <chr>      <dbl> <dbl> <dbl>    <dbl> <dbl> <dbl>
##  1     89   46121     Sout~ Todd        9942  4862  5080      3.7  10     0.3
##  2     93.8 2158      Alas~ Kusil~      7914  4200  3714      1     4.5   0.4
##  3     97.6 1035      Alab~ Conec~     12865  6176  6689      1.6  51    44.7
##  4     98.2 46041     Sout~ Dewey       5579  2718  2861      0.9  21.7   0  
##  5     98.4 46102     Sout~ Oglal~     14153  6809  7344      1     4.8   0.5
##  6     98.4 8025      Colo~ Crowl~      5551  3145  2406     30.2  61.1   5.3
##  7     98.9 46031     Sout~ Corson      4149  2123  2026      0.6  30.9   0.3
##  8     99   46137     Sout~ Zieba~      2833  1391  1442      3.7  23.7   0.3
##  9     99.4 28119     Miss~ Quitm~      7761  3751  4010      0.9  28.3  70.2
## 10     99.8 45005     Sout~ Allen~      9838  5225  4613      2.8  23.2  73.8
## 11     99.9 28053     Miss~ Humph~      8984  4278  4706      2.6  22    75.1
## # ... with 26 more variables: native <dbl>, asian <dbl>, pacific <dbl>,
## #   citizen <dbl>, income <dbl>, income_per_cap <dbl>, poverty <dbl>,
## #   child_poverty <dbl>, professional <dbl>, service <dbl>, office <dbl>,
## #   construction <dbl>, production <dbl>, drive <dbl>, carpool <dbl>,
## #   transit <dbl>, walk <dbl>, other_transp <dbl>, work_at_home <dbl>,
## #   mean_commute <dbl>, employed <dbl>, private_work <dbl>, public_work <dbl>,
## #   self_employed <dbl>, family_work <dbl>, unemployment <dbl>

Please see the above table with the lowest ten counties by the new race_tot variable.

1. Which state, on average, has the lowest sum of these race percentage variables?

county_data %>% 
  select(race_tot, everything()) %>% 
  group_by(state) %>% 
  summarise(avg = mean(race_tot)) %>% 
  arrange(avg) %>% 
  top_n(1)

## # A tibble: 1 x 2
##   state         avg
##   <fct>       <dbl>
## 1 Mississippi  99.2

Mississippi, on average, has the lowest sum of these race percentage variables.

1. Do any counties have a sum greater than 100%?

county_data %>% 
  filter(race_tot > 100) %>% 
  nrow()

## [1] 11

11 counties have a sum greater than 100%.

1. How many states have a sum that equals exactly to 100%?

county_data %>% 
  filter(race_tot == 100) %>% 
  nrow()

## [1] 27

27 counties have a sum equal to 100%.

Using the carpool variable,

1. Use the function to create a new variable called carpool_rankwhere the highest ranked county (rank = 1) is the county with the highest carpoolvalue. Read the documentation carefully for the ranking function.

county_data$carpool_rank = min_rank(-(county_data$carpool))

1. Find the 10 highest ranked counties for carpooling. Show the census ID, countyname, state, carpool value, and carpool_rank in your final answer.

county_data %>% 
  select(census_id, county, state, carpool, carpool_rank) %>% 
  arrange(carpool_rank) %>% 
  top_n(-10, wt=carpool_rank)

## # A tibble: 10 x 5
##    census_id county   state    carpool carpool_rank
##    <chr>     <chr>    <fct>      <dbl>        <int>
##  1 13061     Clay     Georgia     29.9            1
##  2 18087     LaGrange Indiana     27              2
##  3 13165     Jenkins  Georgia     25.3            3
##  4 5133      Sevier   Arkansas    24.4            4
##  5 20175     Seward   Kansas      23.4            5
##  6 48079     Cochran  Texas       22.8            6
##  7 48247     Jim Hogg Texas       22.6            7
##  8 48393     Roberts  Texas       22.4            8
##  9 39075     Holmes   Ohio        21.8            9
## 10 21197     Powell   Kentucky    21.6           10

Please see the above table for the 10 highest ranked counties for carpooling.

1. Find the 10 lowest ranked counties for carpooling. Show the same variables in yourfinal answer.

county_data %>% 
  select(census_id, county, state, carpool, carpool_rank) %>% 
  arrange(carpool_rank) %>% 
  top_n(10, wt=carpool_rank)

## # A tibble: 11 x 5
##    census_id county      state        carpool carpool_rank
##    <chr>     <chr>       <fct>          <dbl>        <int>
##  1 46069     Hyde        South Dakota     2.8         3130
##  2 51720     Norton city Virginia         2.8         3130
##  3 30019     Daniels     Montana          2.6         3132
##  4 31057     Dundy       Nebraska         2.6         3132
##  5 13309     Wheeler     Georgia          2.3         3134
##  6 38029     Emmons      North Dakota     2.3         3134
##  7 36061     New York    New York         1.9         3136
##  8 31183     Wheeler     Nebraska         1.3         3137
##  9 48235     Irion       Texas            0.9         3138
## 10 48261     Kenedy      Texas            0           3139
## 11 48269     King        Texas            0           3139

Please see the above table for the 10 lowest ranked counties for carpooling.

1. On average, what state is the best ranked for carpooling?

county_data %>% 
  group_by(state) %>% 
  summarise(averagerank = mean(carpool_rank)) %>% 
  arrange(averagerank) %>% 
  top_n(-1)

## # A tibble: 1 x 2
##   state  averagerank
##   <fct>        <dbl>
## 1 Hawaii        651.

Hawaii is the best ranked for carpooling.

1. What are the top 5 states for carpooling?

county_data %>% 
  group_by(state) %>% 
  summarise(averagerank = mean(carpool_rank)) %>% 
  arrange(averagerank) %>% 
  top_n(-5)

## # A tibble: 5 x 2
##   state    averagerank
##   <fct>          <dbl>
## 1 Hawaii          651.
## 2 Arizona         970.
## 3 Utah           1019.
## 4 Arkansas       1054.
## 5 Alaska         1086.

Please see the above table for the top 5 states for carpooling.

Homework 3

Adam Deuber

11/7/2020

Importing Data and Data Cleaning

Data Manipulation and Insights