#import necessary packages
library(lubridate)
library(glue)
library(tidyverse)
library(dplyr)
library(tidyr)

Importing Data and Data Cleaning

The data for this assignment is from an annual survey called the American Community Survey (ACS) that is collected by the United States Census Bureau every year. It shows five year estimates for variables in the dataset for each politically designated county in the United States.

#import the data
data <- read_csv("acs_2015_county_data_revised.csv", col_names = TRUE)

There are 3,142 rows and 35 columns in this data set.

#check rows
nrow(data)
## [1] 3142
#check columns
ncol(data)
## [1] 35

Changing Variable Types

The majority of variable types are numeric and represent population totals or percentages. State and county are character variables. The only variable that I think needs to be changed is the census_id. It is originally a numeric double type, but since it doesn’t represent a number, it is just a unique identifier, I will change it to character type.

#change census_id to character type
data$census_id <- as.character(data$census_id)

Below is a snapshot of the dataset.

glimpse(data)
## Observations: 3,142
## Variables: 35
## $ census_id      <chr> "1001", "1003", "1005", "1007", "1009", "1011", "101...
## $ state          <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama...
## $ county         <chr> "Autauga", "Baldwin", "Barbour", "Bibb", "Blount", "...
## $ total_pop      <dbl> 55221, 195121, 26932, 22604, 57710, 10678, 20354, 11...
## $ men            <dbl> 26745, 95314, 14497, 12073, 28512, 5660, 9502, 56274...
## $ women          <dbl> 28476, 99807, 12435, 10531, 29198, 5018, 10852, 6037...
## $ hispanic       <dbl> 2.6, 4.5, 4.6, 2.2, 8.6, 4.4, 1.2, 3.5, 0.4, 1.5, 7....
## $ white          <dbl> 75.8, 83.1, 46.2, 74.5, 87.9, 22.2, 53.3, 73.0, 57.3...
## $ black          <dbl> 18.5, 9.5, 46.7, 21.4, 1.5, 70.7, 43.8, 20.3, 40.3, ...
## $ native         <dbl> 0.4, 0.6, 0.2, 0.4, 0.3, 1.2, 0.1, 0.2, 0.2, 0.6, 0....
## $ asian          <dbl> 1.0, 0.7, 0.4, 0.1, 0.1, 0.2, 0.4, 0.9, 0.8, 0.3, 0....
## $ pacific        <dbl> 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0....
## $ citizen        <dbl> 40725, 147695, 20714, 17495, 42345, 8057, 15581, 886...
## $ income         <dbl> 51281, 50254, 32964, 38678, 45813, 31938, 32229, 417...
## $ income_per_cap <dbl> 24974, 27317, 16824, 18431, 20532, 17580, 18390, 213...
## $ poverty        <dbl> 12.9, 13.4, 26.7, 16.8, 16.7, 24.6, 25.4, 20.5, 21.6...
## $ child_poverty  <dbl> 18.6, 19.2, 45.3, 27.9, 27.2, 38.4, 39.2, 31.6, 37.2...
## $ professional   <dbl> 33.2, 33.1, 26.8, 21.5, 28.5, 18.8, 27.5, 27.3, 23.3...
## $ service        <dbl> 17.0, 17.7, 16.1, 17.9, 14.1, 15.0, 16.6, 17.7, 14.5...
## $ office         <dbl> 24.2, 27.1, 23.1, 17.8, 23.9, 19.7, 21.9, 24.2, 26.3...
## $ construction   <dbl> 8.6, 10.8, 10.8, 19.0, 13.5, 20.1, 10.3, 10.5, 11.5,...
## $ production     <dbl> 17.1, 11.2, 23.1, 23.7, 19.9, 26.4, 23.7, 20.4, 24.4...
## $ drive          <dbl> 87.5, 84.7, 83.8, 83.2, 84.9, 74.9, 84.5, 85.3, 85.1...
## $ carpool        <dbl> 8.8, 8.8, 10.9, 13.5, 11.2, 14.9, 12.4, 9.4, 11.9, 1...
## $ transit        <dbl> 0.1, 0.1, 0.4, 0.5, 0.4, 0.7, 0.0, 0.2, 0.2, 0.2, 0....
## $ walk           <dbl> 0.5, 1.0, 1.8, 0.6, 0.9, 5.0, 0.8, 1.2, 0.3, 0.6, 1....
## $ other_transp   <dbl> 1.3, 1.4, 1.5, 1.5, 0.4, 1.7, 0.6, 1.2, 0.4, 0.7, 1....
## $ work_at_home   <dbl> 1.8, 3.9, 1.6, 0.7, 2.3, 2.8, 1.7, 2.7, 2.1, 2.5, 1....
## $ mean_commute   <dbl> 26.5, 26.4, 24.1, 28.8, 34.9, 27.5, 24.6, 24.1, 25.1...
## $ employed       <dbl> 23986, 85953, 8597, 8294, 22189, 3865, 7813, 47401, ...
## $ private_work   <dbl> 73.6, 81.5, 71.8, 76.8, 82.0, 79.5, 77.4, 74.1, 85.1...
## $ public_work    <dbl> 20.9, 12.3, 20.8, 16.1, 13.5, 15.1, 16.2, 20.8, 12.1...
## $ self_employed  <dbl> 5.5, 5.8, 7.3, 6.7, 4.2, 5.4, 6.2, 5.0, 2.8, 7.9, 4....
## $ family_work    <dbl> 0.0, 0.4, 0.1, 0.4, 0.4, 0.0, 0.2, 0.1, 0.0, 0.5, 0....
## $ unemployment   <dbl> 7.6, 7.5, 17.6, 8.3, 7.7, 18.0, 10.9, 12.3, 8.9, 7.9...

Handling Missing Values

There are only two missing values in this data set: one in child_poverty and one in income. Because this is such a low number (i.e. less than 1% of the data), I will remove these two observations.

#check missing values per column
colSums(is.na(data))
##      census_id          state         county      total_pop            men 
##              0              0              0              0              0 
##          women       hispanic          white          black         native 
##              0              0              0              0              0 
##          asian        pacific        citizen         income income_per_cap 
##              0              0              0              1              0 
##        poverty  child_poverty   professional        service         office 
##              0              1              0              0              0 
##   construction     production          drive        carpool        transit 
##              0              0              0              0              0 
##           walk   other_transp   work_at_home   mean_commute       employed 
##              0              0              0              0              0 
##   private_work    public_work  self_employed    family_work   unemployment 
##              0              0              0              0              0

After removing these two observations, there are no more missing values in the data set.

#remove rows with missing values
data <- data[complete.cases(data), ]

#check for missing values again
colSums(is.na(data))
##      census_id          state         county      total_pop            men 
##              0              0              0              0              0 
##          women       hispanic          white          black         native 
##              0              0              0              0              0 
##          asian        pacific        citizen         income income_per_cap 
##              0              0              0              0              0 
##        poverty  child_poverty   professional        service         office 
##              0              0              0              0              0 
##   construction     production          drive        carpool        transit 
##              0              0              0              0              0 
##           walk   other_transp   work_at_home   mean_commute       employed 
##              0              0              0              0              0 
##   private_work    public_work  self_employed    family_work   unemployment 
##              0              0              0              0              0

Data Summary

The only variable that seems to have strange values is employed. According to the data dictionary, this variable is supposed to be the percentage of people employed, ages 16+. However, looking at the summary for this variable, the numbers are not in terms of percents but instead appear to be the total number of people ages 16+ employed.

summary(data)
##   census_id            state              county            total_pop       
##  Length:3140        Length:3140        Length:3140        Min.   :     267  
##  Class :character   Class :character   Class :character   1st Qu.:   11036  
##  Mode  :character   Mode  :character   Mode  :character   Median :   25793  
##                                                           Mean   :  100801  
##                                                           3rd Qu.:   67620  
##                                                           Max.   :10038388  
##       men              women            hispanic          white      
##  Min.   :    136   Min.   :    131   Min.   : 0.000   Min.   : 0.90  
##  1st Qu.:   5551   1st Qu.:   5488   1st Qu.: 1.900   1st Qu.:65.67  
##  Median :  12838   Median :  12916   Median : 3.700   Median :84.65  
##  Mean   :  49597   Mean   :  51204   Mean   : 8.819   Mean   :77.31  
##  3rd Qu.:  33328   3rd Qu.:  34123   3rd Qu.: 9.000   3rd Qu.:93.33  
##  Max.   :4945351   Max.   :5093037   Max.   :98.700   Max.   :99.80  
##      black            native           asian           pacific        
##  Min.   : 0.000   Min.   : 0.000   Min.   : 0.000   Min.   : 0.00000  
##  1st Qu.: 0.600   1st Qu.: 0.100   1st Qu.: 0.200   1st Qu.: 0.00000  
##  Median : 2.100   Median : 0.300   Median : 0.500   Median : 0.00000  
##  Mean   : 8.885   Mean   : 1.763   Mean   : 1.253   Mean   : 0.07357  
##  3rd Qu.:10.200   3rd Qu.: 0.600   3rd Qu.: 1.200   3rd Qu.: 0.00000  
##  Max.   :85.900   Max.   :92.100   Max.   :41.600   Max.   :11.10000  
##     citizen            income       income_per_cap     poverty    
##  Min.   :    199   Min.   : 19328   Min.   : 8292   Min.   : 1.4  
##  1st Qu.:   8276   1st Qu.: 38826   1st Qu.:20470   1st Qu.:12.0  
##  Median :  19454   Median : 45095   Median :23575   Median :16.0  
##  Mean   :  70849   Mean   : 46824   Mean   :24331   Mean   :16.7  
##  3rd Qu.:  50795   3rd Qu.: 52248   3rd Qu.:27138   3rd Qu.:20.3  
##  Max.   :6046749   Max.   :123453   Max.   :65600   Max.   :53.3  
##  child_poverty    professional      service          office     
##  Min.   : 0.00   Min.   :13.50   Min.   : 5.00   Min.   : 4.10  
##  1st Qu.:16.10   1st Qu.:26.70   1st Qu.:15.90   1st Qu.:20.20  
##  Median :22.50   Median :30.00   Median :18.00   Median :22.40  
##  Mean   :23.29   Mean   :31.05   Mean   :18.25   Mean   :22.13  
##  3rd Qu.:29.50   3rd Qu.:34.42   3rd Qu.:20.20   3rd Qu.:24.30  
##  Max.   :72.30   Max.   :74.00   Max.   :36.60   Max.   :35.40  
##   construction     production        drive         carpool     
##  Min.   : 1.70   Min.   : 0.00   Min.   : 5.2   Min.   : 0.00  
##  1st Qu.: 9.80   1st Qu.:11.50   1st Qu.:76.6   1st Qu.: 8.50  
##  Median :12.20   Median :15.40   Median :80.6   Median : 9.90  
##  Mean   :12.75   Mean   :15.82   Mean   :79.1   Mean   :10.33  
##  3rd Qu.:15.00   3rd Qu.:19.40   3rd Qu.:83.6   3rd Qu.:11.90  
##  Max.   :40.30   Max.   :55.60   Max.   :94.6   Max.   :29.90  
##     transit             walk         other_transp    work_at_home   
##  Min.   : 0.0000   Min.   : 0.000   Min.   : 0.00   Min.   : 0.000  
##  1st Qu.: 0.1000   1st Qu.: 1.400   1st Qu.: 0.90   1st Qu.: 2.800  
##  Median : 0.4000   Median : 2.400   Median : 1.30   Median : 4.000  
##  Mean   : 0.9681   Mean   : 3.294   Mean   : 1.61   Mean   : 4.694  
##  3rd Qu.: 0.8000   3rd Qu.: 4.000   3rd Qu.: 1.90   3rd Qu.: 5.700  
##  Max.   :61.7000   Max.   :71.200   Max.   :39.10   Max.   :37.200  
##   mean_commute      employed        private_work    public_work   
##  Min.   : 4.90   Min.   :    166   Min.   :29.50   Min.   : 5.80  
##  1st Qu.:19.30   1st Qu.:   4532   1st Qu.:70.90   1st Qu.:13.07  
##  Median :22.90   Median :  10657   Median :75.85   Median :16.10  
##  Mean   :23.15   Mean   :  46416   Mean   :74.45   Mean   :17.33  
##  3rd Qu.:26.60   3rd Qu.:  29272   3rd Qu.:79.80   3rd Qu.:20.10  
##  Max.   :44.00   Max.   :4635465   Max.   :88.30   Max.   :66.20  
##  self_employed     family_work      unemployment   
##  Min.   : 0.000   Min.   :0.0000   Min.   : 0.000  
##  1st Qu.: 5.400   1st Qu.:0.1000   1st Qu.: 5.500  
##  Median : 6.900   Median :0.2000   Median : 7.500  
##  Mean   : 7.922   Mean   :0.2917   Mean   : 7.815  
##  3rd Qu.: 9.400   3rd Qu.:0.3000   3rd Qu.: 9.700  
##  Max.   :36.600   Max.   :9.8000   Max.   :29.400

I will change this variable to a percantage by dividing the current number in the column by the population of that county.Now the values for employed fall between 0 and 100 like the other percentage columns.

data$employed <- ((data$employed/data$total_pop)*100)
summary(data$employed)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   16.57   39.01   43.66   43.36   48.19   76.24

Data Manipulation and Insights

  1. There are 1,984 counties that have more women than men in this dataset.
data %>% 
  group_by(county) %>%
  select(county, women, men) %>% 
  filter(women>men)
## # A tibble: 1,984 x 3
## # Groups:   county [1,248]
##    county   women   men
##    <chr>    <dbl> <dbl>
##  1 Autauga  28476 26745
##  2 Baldwin  99807 95314
##  3 Blount   29198 28512
##  4 Butler   10852  9502
##  5 Calhoun  60374 56274
##  6 Chambers 17821 16258
##  7 Cherokee 13033 12975
##  8 Chilton  22200 21619
##  9 Choctaw   7013  6382
## 10 Clarke   13236 11834
## # ... with 1,974 more rows
  1. There are 2,419 counties that have an unemployment rate lower than 10%.
data %>%
  group_by(county) %>% 
  select(county, unemployment) %>% 
  filter(unemployment<10)
## # A tibble: 2,419 x 2
## # Groups:   county [1,482]
##    county   unemployment
##    <chr>           <dbl>
##  1 Autauga           7.6
##  2 Baldwin           7.5
##  3 Bibb              8.3
##  4 Blount            7.7
##  5 Chambers          8.9
##  6 Cherokee          7.9
##  7 Chilton           9.1
##  8 Clay              9.4
##  9 Cleburne          8.3
## 10 Coffee            7.1
## # ... with 2,409 more rows
  1. The top 10 counties with the highest mean commute are Pike, Bronx, Charles, Warren, Queens, Richmond, Westmoreland, Park, Kings, and Clay.
top_n(data,10,mean_commute) %>% 
  select(census_id, county, state, mean_commute) %>% 
  arrange(desc(mean_commute))
## # A tibble: 10 x 4
##    census_id county       state         mean_commute
##    <chr>     <chr>        <chr>                <dbl>
##  1 42103     Pike         Pennsylvania          44  
##  2 36005     Bronx        New York              43  
##  3 24017     Charles      Maryland              42.8
##  4 51187     Warren       Virginia              42.7
##  5 36081     Queens       New York              42.6
##  6 36085     Richmond     New York              42.6
##  7 51193     Westmoreland Virginia              42.5
##  8 8093      Park         Colorado              42.4
##  9 36047     Kings        New York              41.7
## 10 54015     Clay         West Virginia         41.4
  1. A new variable called women_pct was created to calculate the percentage of women in each county. The 10 counties with the lowest percentage of women are shown below.
data %>% mutate(
       women_pct=women/total_pop*100) %>% 
   top_n(-10,women_pct) %>% 
  select(census_id, county, state, women_pct) %>% 
  arrange(women_pct)
## # A tibble: 10 x 4
##    census_id county                 state        women_pct
##    <chr>     <chr>                  <chr>            <dbl>
##  1 42053     Forest                 Pennsylvania      26.8
##  2 8011      Bent                   Colorado          31.4
##  3 51183     Sussex                 Virginia          31.5
##  4 13309     Wheeler                Georgia           32.1
##  5 6035      Lassen                 California        33.2
##  6 48095     Concho                 Texas             33.3
##  7 13053     Chattahoochee          Georgia           33.4
##  8 2013      Aleutians East Borough Alaska            33.5
##  9 22125     West Feliciana         Louisiana         33.6
## 10 32027     Pershing               Nevada            33.7
  1. A new variable called race_sum was created to calculate the sum of all the race percentage variables. The 10 counties with the lowest race_sum are shown below.
data %>% mutate(
       race_sum=hispanic+white+black+native+asian+pacific) %>% 
   top_n(-10,race_sum) %>% 
  select(census_id, county, state, race_sum) %>% 
  arrange(race_sum)
## # A tibble: 10 x 4
##    census_id county                   state     race_sum
##    <chr>     <chr>                    <chr>        <dbl>
##  1 15001     Hawaii                   Hawaii        76.4
##  2 15009     Maui                     Hawaii        79.2
##  3 40097     Mayes                    Oklahoma      79.7
##  4 15003     Honolulu                 Hawaii        81.5
##  5 40123     Pontotoc                 Oklahoma      82.8
##  6 47061     Grundy                   Tennessee     83. 
##  7 2282      Yakutat City and Borough Alaska        83.4
##  8 40069     Johnston                 Oklahoma      84  
##  9 15007     Kauai                    Hawaii        84.1
## 10 40003     Alfalfa                  Oklahoma      85.1

On average, Hawaii has the lowest sum of the race percentage variables.

data %>% mutate(
       race_sum=hispanic+white+black+native+asian+pacific) %>% 
  group_by(state) %>% 
  summarize(race_sum_mean=mean(race_sum)) %>% 
  select(state, race_sum_mean) %>% 
  arrange(race_sum_mean)
## # A tibble: 51 x 2
##    state                race_sum_mean
##    <chr>                        <dbl>
##  1 Hawaii                        80.3
##  2 Alaska                        92.7
##  3 Oklahoma                      92.8
##  4 Washington                    96.7
##  5 California                    96.9
##  6 Oregon                        97.1
##  7 Delaware                      97.3
##  8 Massachusetts                 97.5
##  9 Maryland                      97.6
## 10 District of Columbia          97.6
## # ... with 41 more rows

There are 5 counties in the data set that have a sum of these race percentage variables that are 100.1%. They are Gosper, Hooker, Bailey, Edwards, and Nance counties.

data %>% mutate(
       race_sum=hispanic+white+black+native+asian+pacific) %>% 
   filter(race_sum>100.0) %>% 
  select(census_id, county, state, race_sum) %>% 
  arrange(desc(race_sum))
## # A tibble: 11 x 4
##    census_id county    state       race_sum
##    <chr>     <chr>     <chr>          <dbl>
##  1 31073     Gosper    Nebraska        100.
##  2 31091     Hooker    Nebraska        100.
##  3 48017     Bailey    Texas           100.
##  4 48137     Edwards   Texas           100.
##  5 31125     Nance     Nebraska        100.
##  6 28021     Claiborne Mississippi     100.
##  7 48131     Duval     Texas           100.
##  8 48261     Kenedy    Texas           100.
##  9 48263     Kent      Texas           100.
## 10 48377     Presidio  Texas           100.
## 11 49001     Beaver    Utah            100.

There are 13 states that have a sum that equals exactly 100%.

data %>% mutate(
       race_sum=hispanic+white+black+native+asian+pacific) %>% 
     group_by(state, race_sum) %>% 
  filter(race_sum==100) %>% 
select(state, race_sum)
## # A tibble: 27 x 2
## # Groups:   state, race_sum [13]
##    state       race_sum
##    <chr>          <dbl>
##  1 Alabama          100
##  2 Alabama          100
##  3 Georgia          100
##  4 Georgia          100
##  5 Kansas           100
##  6 Kentucky         100
##  7 Mississippi      100
##  8 Montana          100
##  9 Montana          100
## 10 Montana          100
## # ... with 17 more rows
  1. A new variable called carpool_rank was created where the county with the highest carpool value has rank=1. The 10 counties that are the best for carpooling based on these rankings are listed below.
data %>% 
  mutate(carpool_rank = min_rank(-carpool)) %>% 
  top_n(-10, carpool_rank) %>% 
  select(census_id, county, state, carpool, carpool_rank) %>% 
  arrange(carpool_rank)
## # A tibble: 10 x 5
##    census_id county   state    carpool carpool_rank
##    <chr>     <chr>    <chr>      <dbl>        <int>
##  1 13061     Clay     Georgia     29.9            1
##  2 18087     LaGrange Indiana     27              2
##  3 13165     Jenkins  Georgia     25.3            3
##  4 5133      Sevier   Arkansas    24.4            4
##  5 20175     Seward   Kansas      23.4            5
##  6 48079     Cochran  Texas       22.8            6
##  7 48247     Jim Hogg Texas       22.6            7
##  8 48393     Roberts  Texas       22.4            8
##  9 39075     Holmes   Ohio        21.8            9
## 10 21197     Powell   Kentucky    21.6           10

The 10 lowest ranked counties for carpooling are listed in this table below.

data %>% 
  mutate(carpool_rank = min_rank(-carpool)) %>% 
  top_n(10, carpool_rank) %>% 
  select(census_id, county, state, carpool, carpool_rank) %>% 
  arrange(desc(carpool_rank))
## # A tibble: 11 x 5
##    census_id county      state        carpool carpool_rank
##    <chr>     <chr>       <chr>          <dbl>        <int>
##  1 48261     Kenedy      Texas            0           3139
##  2 48269     King        Texas            0           3139
##  3 48235     Irion       Texas            0.9         3138
##  4 31183     Wheeler     Nebraska         1.3         3137
##  5 36061     New York    New York         1.9         3136
##  6 13309     Wheeler     Georgia          2.3         3134
##  7 38029     Emmons      North Dakota     2.3         3134
##  8 30019     Daniels     Montana          2.6         3132
##  9 31057     Dundy       Nebraska         2.6         3132
## 10 46069     Hyde        South Dakota     2.8         3130
## 11 51720     Norton city Virginia         2.8         3130

On average Hawaii is the state ranked the best for carpooling. and the top five states for carpooling are Hawaii, Alaska, Arkansas, Utah, and Texas.

data %>%
    group_by(state) %>%
    summarise(mean_carpool = mean(carpool)) %>% 
  mutate(carpool_rank=min_rank(-mean_carpool)) %>% 
  arrange(desc(mean_carpool))
## # A tibble: 51 x 3
##    state    mean_carpool carpool_rank
##    <chr>           <dbl>        <int>
##  1 Hawaii           12.8            1
##  2 Alaska           12.1            2
##  3 Arkansas         11.9            3
##  4 Utah             11.9            4
##  5 Texas            11.8            5
##  6 Nevada           11.7            6
##  7 Arizona          11.6            7
##  8 Missouri         11.5            8
##  9 Wyoming          11.5            9
## 10 Idaho            11.4           10
## # ... with 41 more rows