Importing Data and Data Cleaning

1. Importing Data Set

#install.packages(tidyverse)
library(tidyverse)

setwd("~/Desktop/Grad School/Data Wrangling/Assignment 3")

data <- read_csv("acs_2015_county_data_revised copy.csv")
nrow(data)
## [1] 3142
ncol(data)
## [1] 35
glimpse(data)
## Rows: 3,142
## Columns: 35
## $ census_id      <dbl> 1001, 1003, 1005, 1007, 1009, 1011, 1013, 1015, 1017, 1…
## $ state          <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama", …
## $ county         <chr> "Autauga", "Baldwin", "Barbour", "Bibb", "Blount", "Bul…
## $ total_pop      <dbl> 55221, 195121, 26932, 22604, 57710, 10678, 20354, 11664…
## $ men            <dbl> 26745, 95314, 14497, 12073, 28512, 5660, 9502, 56274, 1…
## $ women          <dbl> 28476, 99807, 12435, 10531, 29198, 5018, 10852, 60374, …
## $ hispanic       <dbl> 2.6, 4.5, 4.6, 2.2, 8.6, 4.4, 1.2, 3.5, 0.4, 1.5, 7.6, …
## $ white          <dbl> 75.8, 83.1, 46.2, 74.5, 87.9, 22.2, 53.3, 73.0, 57.3, 9…
## $ black          <dbl> 18.5, 9.5, 46.7, 21.4, 1.5, 70.7, 43.8, 20.3, 40.3, 4.8…
## $ native         <dbl> 0.4, 0.6, 0.2, 0.4, 0.3, 1.2, 0.1, 0.2, 0.2, 0.6, 0.4, …
## $ asian          <dbl> 1.0, 0.7, 0.4, 0.1, 0.1, 0.2, 0.4, 0.9, 0.8, 0.3, 0.3, …
## $ pacific        <dbl> 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, …
## $ citizen        <dbl> 40725, 147695, 20714, 17495, 42345, 8057, 15581, 88612,…
## $ income         <dbl> 51281, 50254, 32964, 38678, 45813, 31938, 32229, 41703,…
## $ income_per_cap <dbl> 24974, 27317, 16824, 18431, 20532, 17580, 18390, 21374,…
## $ poverty        <dbl> 12.9, 13.4, 26.7, 16.8, 16.7, 24.6, 25.4, 20.5, 21.6, 1…
## $ child_poverty  <dbl> 18.6, 19.2, 45.3, 27.9, 27.2, 38.4, 39.2, 31.6, 37.2, 3…
## $ professional   <dbl> 33.2, 33.1, 26.8, 21.5, 28.5, 18.8, 27.5, 27.3, 23.3, 2…
## $ service        <dbl> 17.0, 17.7, 16.1, 17.9, 14.1, 15.0, 16.6, 17.7, 14.5, 1…
## $ office         <dbl> 24.2, 27.1, 23.1, 17.8, 23.9, 19.7, 21.9, 24.2, 26.3, 1…
## $ construction   <dbl> 8.6, 10.8, 10.8, 19.0, 13.5, 20.1, 10.3, 10.5, 11.5, 13…
## $ production     <dbl> 17.1, 11.2, 23.1, 23.7, 19.9, 26.4, 23.7, 20.4, 24.4, 2…
## $ drive          <dbl> 87.5, 84.7, 83.8, 83.2, 84.9, 74.9, 84.5, 85.3, 85.1, 8…
## $ carpool        <dbl> 8.8, 8.8, 10.9, 13.5, 11.2, 14.9, 12.4, 9.4, 11.9, 12.1…
## $ transit        <dbl> 0.1, 0.1, 0.4, 0.5, 0.4, 0.7, 0.0, 0.2, 0.2, 0.2, 0.2, …
## $ walk           <dbl> 0.5, 1.0, 1.8, 0.6, 0.9, 5.0, 0.8, 1.2, 0.3, 0.6, 1.1, …
## $ other_transp   <dbl> 1.3, 1.4, 1.5, 1.5, 0.4, 1.7, 0.6, 1.2, 0.4, 0.7, 1.4, …
## $ work_at_home   <dbl> 1.8, 3.9, 1.6, 0.7, 2.3, 2.8, 1.7, 2.7, 2.1, 2.5, 1.9, …
## $ mean_commute   <dbl> 26.5, 26.4, 24.1, 28.8, 34.9, 27.5, 24.6, 24.1, 25.1, 2…
## $ employed       <dbl> 23986, 85953, 8597, 8294, 22189, 3865, 7813, 47401, 136…
## $ private_work   <dbl> 73.6, 81.5, 71.8, 76.8, 82.0, 79.5, 77.4, 74.1, 85.1, 7…
## $ public_work    <dbl> 20.9, 12.3, 20.8, 16.1, 13.5, 15.1, 16.2, 20.8, 12.1, 1…
## $ self_employed  <dbl> 5.5, 5.8, 7.3, 6.7, 4.2, 5.4, 6.2, 5.0, 2.8, 7.9, 4.1, …
## $ family_work    <dbl> 0.0, 0.4, 0.1, 0.4, 0.4, 0.0, 0.2, 0.1, 0.0, 0.5, 0.5, …
## $ unemployment   <dbl> 7.6, 7.5, 17.6, 8.3, 7.7, 18.0, 10.9, 12.3, 8.9, 7.9, 9…

2. Changing Data Types

Looking at the variable types from the glimpse(), it doesn’t look like we need to change anything. All the character variables and integer variables are correctly identified in the table data frame.

3. Dealing With Missing Values

sum(is.na(data))
## [1] 2
colSums(is.na(data))
##      census_id          state         county      total_pop            men 
##              0              0              0              0              0 
##          women       hispanic          white          black         native 
##              0              0              0              0              0 
##          asian        pacific        citizen         income income_per_cap 
##              0              0              0              1              0 
##        poverty  child_poverty   professional        service         office 
##              0              1              0              0              0 
##   construction     production          drive        carpool        transit 
##              0              0              0              0              0 
##           walk   other_transp   work_at_home   mean_commute       employed 
##              0              0              0              0              0 
##   private_work    public_work  self_employed    family_work   unemployment 
##              0              0              0              0              0
data$income[is.na(data$income)] <- median(data$income, na.rm = TRUE)

data$child_poverty[is.na(data$child_poverty)] <- median(data$child_poverty, na.rm = TRUE)

colSums(is.na(data))
##      census_id          state         county      total_pop            men 
##              0              0              0              0              0 
##          women       hispanic          white          black         native 
##              0              0              0              0              0 
##          asian        pacific        citizen         income income_per_cap 
##              0              0              0              0              0 
##        poverty  child_poverty   professional        service         office 
##              0              0              0              0              0 
##   construction     production          drive        carpool        transit 
##              0              0              0              0              0 
##           walk   other_transp   work_at_home   mean_commute       employed 
##              0              0              0              0              0 
##   private_work    public_work  self_employed    family_work   unemployment 
##              0              0              0              0              0

When I checked for missing values, we see that there are only two. I computed both of those missing values using the median value. This made more sense because I did not want to skew the data by using an average.

4. Dealing With Unusual Values

summary(data)
##    census_id        state              county            total_pop       
##  Min.   : 1001   Length:3142        Length:3142        Min.   :      85  
##  1st Qu.:18178   Class :character   Class :character   1st Qu.:   11028  
##  Median :29176   Mode  :character   Mode  :character   Median :   25768  
##  Mean   :30384                                         Mean   :  100737  
##  3rd Qu.:45080                                         3rd Qu.:   67552  
##  Max.   :56045                                         Max.   :10038388  
##       men              women            hispanic          white      
##  Min.   :     42   Min.   :     43   Min.   : 0.000   Min.   : 0.90  
##  1st Qu.:   5546   1st Qu.:   5466   1st Qu.: 1.900   1st Qu.:65.60  
##  Median :  12826   Median :  12907   Median : 3.700   Median :84.60  
##  Mean   :  49565   Mean   :  51171   Mean   : 8.826   Mean   :77.28  
##  3rd Qu.:  33319   3rd Qu.:  34122   3rd Qu.: 9.000   3rd Qu.:93.30  
##  Max.   :4945351   Max.   :5093037   Max.   :98.700   Max.   :99.80  
##      black            native           asian           pacific        
##  Min.   : 0.000   Min.   : 0.000   Min.   : 0.000   Min.   : 0.00000  
##  1st Qu.: 0.600   1st Qu.: 0.100   1st Qu.: 0.200   1st Qu.: 0.00000  
##  Median : 2.100   Median : 0.300   Median : 0.500   Median : 0.00000  
##  Mean   : 8.879   Mean   : 1.766   Mean   : 1.258   Mean   : 0.08475  
##  3rd Qu.:10.175   3rd Qu.: 0.600   3rd Qu.: 1.200   3rd Qu.: 0.00000  
##  Max.   :85.900   Max.   :92.100   Max.   :41.600   Max.   :35.30000  
##     citizen            income       income_per_cap     poverty    
##  Min.   :     80   Min.   : 19328   Min.   : 8292   Min.   : 1.4  
##  1st Qu.:   8254   1st Qu.: 38826   1st Qu.:20471   1st Qu.:12.0  
##  Median :  19434   Median : 45111   Median :23577   Median :16.0  
##  Mean   :  70804   Mean   : 46830   Mean   :24338   Mean   :16.7  
##  3rd Qu.:  50728   3rd Qu.: 52249   3rd Qu.:27138   3rd Qu.:20.3  
##  Max.   :6046749   Max.   :123453   Max.   :65600   Max.   :53.3  
##  child_poverty    professional      service          office     
##  Min.   : 0.00   Min.   :13.50   Min.   : 5.00   Min.   : 4.10  
##  1st Qu.:16.10   1st Qu.:26.70   1st Qu.:15.90   1st Qu.:20.20  
##  Median :22.50   Median :30.00   Median :18.00   Median :22.40  
##  Mean   :23.29   Mean   :31.04   Mean   :18.26   Mean   :22.13  
##  3rd Qu.:29.48   3rd Qu.:34.40   3rd Qu.:20.20   3rd Qu.:24.30  
##  Max.   :72.30   Max.   :74.00   Max.   :36.60   Max.   :35.40  
##   construction     production        drive          carpool     
##  Min.   : 1.70   Min.   : 0.00   Min.   : 5.20   Min.   : 0.00  
##  1st Qu.: 9.80   1st Qu.:11.53   1st Qu.:76.60   1st Qu.: 8.50  
##  Median :12.20   Median :15.40   Median :80.60   Median : 9.90  
##  Mean   :12.74   Mean   :15.82   Mean   :79.08   Mean   :10.33  
##  3rd Qu.:15.00   3rd Qu.:19.40   3rd Qu.:83.60   3rd Qu.:11.88  
##  Max.   :40.30   Max.   :55.60   Max.   :94.60   Max.   :29.90  
##     transit             walk         other_transp     work_at_home   
##  Min.   : 0.0000   Min.   : 0.000   Min.   : 0.000   Min.   : 0.000  
##  1st Qu.: 0.1000   1st Qu.: 1.400   1st Qu.: 0.900   1st Qu.: 2.800  
##  Median : 0.4000   Median : 2.400   Median : 1.300   Median : 4.000  
##  Mean   : 0.9675   Mean   : 3.307   Mean   : 1.614   Mean   : 4.697  
##  3rd Qu.: 0.8000   3rd Qu.: 4.000   3rd Qu.: 1.900   3rd Qu.: 5.700  
##  Max.   :61.7000   Max.   :71.200   Max.   :39.100   Max.   :37.200  
##   mean_commute      employed        private_work    public_work   
##  Min.   : 4.90   Min.   :     62   Min.   :25.00   Min.   : 5.80  
##  1st Qu.:19.30   1st Qu.:   4524   1st Qu.:70.90   1st Qu.:13.10  
##  Median :22.90   Median :  10644   Median :75.80   Median :16.10  
##  Mean   :23.15   Mean   :  46387   Mean   :74.44   Mean   :17.35  
##  3rd Qu.:26.60   3rd Qu.:  29254   3rd Qu.:79.80   3rd Qu.:20.10  
##  Max.   :44.00   Max.   :4635465   Max.   :88.30   Max.   :66.20  
##  self_employed     family_work      unemployment   
##  Min.   : 0.000   Min.   :0.0000   Min.   : 0.000  
##  1st Qu.: 5.400   1st Qu.:0.1000   1st Qu.: 5.500  
##  Median : 6.900   Median :0.2000   Median : 7.500  
##  Mean   : 7.921   Mean   :0.2915   Mean   : 7.815  
##  3rd Qu.: 9.400   3rd Qu.:0.3000   3rd Qu.: 9.700  
##  Max.   :36.600   Max.   :9.8000   Max.   :29.400

Based on the results from the summary tables for each of the variables, there seems to be no unusual values and nothing needs to be done to deal with them.

Data Manipulation and Insights

5. How many counties have more women than men?

count(filter(data, women > men))
## # A tibble: 1 × 1
##       n
##   <int>
## 1  1985

This code chunk counts the number of rows where there are more women than men. There are 1985 counties that have more women than men.

6. How many counties have an unemployment rate lower than 10%?

count(filter(data, unemployment < 0.10))
## # A tibble: 1 × 1
##       n
##   <int>
## 1     4

There are 4 counties with an unemployment rate less than 10%.

7. What are the top 10 counties with the highest mean commute? Show the census ID, county

name, state, and the mean_commute in your final answer (sorted by mean_commute).

data %>%
  select(census_id, state, mean_commute) %>%
  arrange(desc(mean_commute)) %>%
  top_n(10)
## # A tibble: 10 × 3
##    census_id state         mean_commute
##        <dbl> <chr>                <dbl>
##  1     42103 Pennsylvania          44  
##  2     36005 New York              43  
##  3     24017 Maryland              42.8
##  4     51187 Virginia              42.7
##  5     36081 New York              42.6
##  6     36085 New York              42.6
##  7     51193 Virginia              42.5
##  8      8093 Colorado              42.4
##  9     36047 New York              41.7
## 10     54015 West Virginia         41.4

8. Create a new variable that calculates the percentage of women for each county and then find

the top 10 counties with the lowest percentages. Show the census ID, county name, state, and the percentage in your final answer (sorted by ascending percentage).

data_2 <- data %>%
  mutate(percent_women = (women / total_pop) * 100) %>%
  select(census_id, county, state, percent_women) %>%
  arrange(percent_women) %>%
  top_n(10)

data_2
## # A tibble: 10 × 4
##    census_id county        state       percent_women
##        <dbl> <chr>         <chr>               <dbl>
##  1     29117 Livingston    Missouri             54.9
##  2     35011 De Baca       New Mexico           55.1
##  3     51790 Staunton city Virginia             55.1
##  4     48137 Edwards       Texas                55.2
##  5     51091 Highland      Virginia             55.3
##  6     51620 Franklin city Virginia             55.5
##  7     28125 Sharkey       Mississippi          55.5
##  8      1119 Sumter        Alabama              55.7
##  9     13235 Pulaski       Georgia              58.0
## 10     51720 Norton city   Virginia             59.4

9. Create a new variable that calculates the sum of all race percentage variables (these columns

are the “hispanic”, “white”, “black”, “native”, “asian”, and “pacific” variables).

data_3 <- data %>% 
  mutate(race_sum = rowSums(across(c(hispanic, white, black, native, asian, pacific)), na.rm = TRUE))

a. What are the top 10 counties with the lowest sum of these race percentage

variables?

data_3 %>%
  arrange() %>%
  top_n(10)
## # A tibble: 43 × 36
##    census_id state      county total_pop   men women hispanic white black native
##        <dbl> <chr>      <chr>      <dbl> <dbl> <dbl>    <dbl> <dbl> <dbl>  <dbl>
##  1      1065 Alabama    Hale       15256  7183  8073      1.1  40    58.2    0.3
##  2      1131 Alabama    Wilcox     11235  5376  5859      0.6  27.4  72      0  
##  3     13201 Georgia    Miller      5936  2837  3099      0.1  68.7  31.1    0.1
##  4     13307 Georgia    Webst…      2720  1370  1350      1    43.2  55.8    0  
##  5     20199 Kansas     Walla…      1595   789   806      3    96.9   0      0  
##  6     21031 Kentucky   Butler     12835  6450  6385      3.5  95.6   0.7    0.2
##  7     28021 Mississip… Claib…      9330  4329  5001      1.1  13.6  84.2    0.2
##  8     28125 Mississip… Shark…      4805  2139  2666      0.6  27.7  71.3    0  
##  9     30019 Montana    Danie…      1758   891   867      2.5  96.6   0      0.7
## 10     30069 Montana    Petro…       443   232   211      1.4  98.6   0      0  
## # ℹ 33 more rows
## # ℹ 26 more variables: asian <dbl>, pacific <dbl>, citizen <dbl>, income <dbl>,
## #   income_per_cap <dbl>, poverty <dbl>, child_poverty <dbl>,
## #   professional <dbl>, service <dbl>, office <dbl>, construction <dbl>,
## #   production <dbl>, drive <dbl>, carpool <dbl>, transit <dbl>, walk <dbl>,
## #   other_transp <dbl>, work_at_home <dbl>, mean_commute <dbl>, employed <dbl>,
## #   private_work <dbl>, public_work <dbl>, self_employed <dbl>, …

b. Which state, on average, has the lowest sum of these race percentage variables?

data %>%
  mutate(race_sum = rowSums(across(c(hispanic, white, black, native, asian, pacific)), na.rm = TRUE)) %>%
  group_by(state) %>%
  summarise(avg_race_sum = mean(race_sum, na.rm = TRUE)) %>%
  arrange(avg_race_sum) %>%
  top_n(1)
## # A tibble: 1 × 2
##   state       avg_race_sum
##   <chr>              <dbl>
## 1 Mississippi         99.2

###c. Do any counties have a sum greater than 100%?

data %>%
  mutate(race_sum = rowSums(across(c(hispanic, white, black, native, asian, pacific)), na.rm = TRUE)) %>%
  filter(race_sum > 100) %>%
  arrange(desc(race_sum)) %>% 
  select(census_id, county, state, race_sum)
## # A tibble: 5 × 4
##   census_id county  state    race_sum
##       <dbl> <chr>   <chr>       <dbl>
## 1     31073 Gosper  Nebraska     100.
## 2     31091 Hooker  Nebraska     100.
## 3     31125 Nance   Nebraska     100.
## 4     48017 Bailey  Texas        100.
## 5     48137 Edwards Texas        100.

###d. How many states have a sum that equals exactly to 100%?

data_3 %>% 
  filter(race_sum == 100) %>% 
  select(state)
## # A tibble: 38 × 1
##    state      
##    <chr>      
##  1 Alabama    
##  2 Alabama    
##  3 Georgia    
##  4 Georgia    
##  5 Kansas     
##  6 Kentucky   
##  7 Mississippi
##  8 Mississippi
##  9 Montana    
## 10 Montana    
## # ℹ 28 more rows

10. Using the carpool variable,

a. Use the function to create a new variable called carpool_rank

where the highest ranked county (rank = 1) is the county with the highest carpool value. Read the documentation carefully for the ranking function.

carpool_df <- data %>%
  mutate(carpool_rank = min_rank(desc(carpool)))

head(carpool_df, 5)
## # A tibble: 5 × 36
##   census_id state county total_pop   men women hispanic white black native asian
##       <dbl> <chr> <chr>      <dbl> <dbl> <dbl>    <dbl> <dbl> <dbl>  <dbl> <dbl>
## 1      1001 Alab… Autau…     55221 26745 28476      2.6  75.8  18.5    0.4   1  
## 2      1003 Alab… Baldw…    195121 95314 99807      4.5  83.1   9.5    0.6   0.7
## 3      1005 Alab… Barbo…     26932 14497 12435      4.6  46.2  46.7    0.2   0.4
## 4      1007 Alab… Bibb       22604 12073 10531      2.2  74.5  21.4    0.4   0.1
## 5      1009 Alab… Blount     57710 28512 29198      8.6  87.9   1.5    0.3   0.1
## # ℹ 25 more variables: pacific <dbl>, citizen <dbl>, income <dbl>,
## #   income_per_cap <dbl>, poverty <dbl>, child_poverty <dbl>,
## #   professional <dbl>, service <dbl>, office <dbl>, construction <dbl>,
## #   production <dbl>, drive <dbl>, carpool <dbl>, transit <dbl>, walk <dbl>,
## #   other_transp <dbl>, work_at_home <dbl>, mean_commute <dbl>, employed <dbl>,
## #   private_work <dbl>, public_work <dbl>, self_employed <dbl>,
## #   family_work <dbl>, unemployment <dbl>, carpool_rank <int>

b. Find the 10 highest ranked counties for carpooling. Show the census ID, county

name, state, carpool value, and carpool_rank in your final answer.

carpool_df_top10 <- data %>%
  mutate(carpool_rank = min_rank(desc(carpool))) %>%
  arrange(carpool_rank) %>%
  select(census_id, county, state, carpool, carpool_rank) %>%
  top_n(10)

carpool_df_top10
## # A tibble: 11 × 5
##    census_id county      state        carpool carpool_rank
##        <dbl> <chr>       <chr>          <dbl>        <int>
##  1     46069 Hyde        South Dakota     2.8         3132
##  2     51720 Norton city Virginia         2.8         3132
##  3     30019 Daniels     Montana          2.6         3134
##  4     31057 Dundy       Nebraska         2.6         3134
##  5     13309 Wheeler     Georgia          2.3         3136
##  6     38029 Emmons      North Dakota     2.3         3136
##  7     36061 New York    New York         1.9         3138
##  8     31183 Wheeler     Nebraska         1.3         3139
##  9     48235 Irion       Texas            0.9         3140
## 10     48261 Kenedy      Texas            0           3141
## 11     48269 King        Texas            0           3141
#the arrange function is not arranging correctly on carpool_rank

c. Find the 10 lowest ranked counties for carpooling. Show the same variables in your

final answer.

carpool_df_bottom10 <- data %>%
  mutate(carpool_rank = min_rank(desc(carpool))) %>%
  arrange(desc(carpool_rank)) %>%
  select(census_id, county, state, carpool, carpool_rank) %>%
  top_n(10)

carpool_df_bottom10
## # A tibble: 11 × 5
##    census_id county      state        carpool carpool_rank
##        <dbl> <chr>       <chr>          <dbl>        <int>
##  1     48261 Kenedy      Texas            0           3141
##  2     48269 King        Texas            0           3141
##  3     48235 Irion       Texas            0.9         3140
##  4     31183 Wheeler     Nebraska         1.3         3139
##  5     36061 New York    New York         1.9         3138
##  6     13309 Wheeler     Georgia          2.3         3136
##  7     38029 Emmons      North Dakota     2.3         3136
##  8     30019 Daniels     Montana          2.6         3134
##  9     31057 Dundy       Nebraska         2.6         3134
## 10     46069 Hyde        South Dakota     2.8         3132
## 11     51720 Norton city Virginia         2.8         3132

d. On average, what state is the best ranked for carpooling?

carpool_df_by_state <- data %>%
  group_by(state) %>%
  summarise(avg_carpool = mean(carpool, na.rm = TRUE)) %>%
  mutate(avg_state_rank = min_rank(desc(avg_carpool))) %>%
  arrange(avg_state_rank) %>%
  select(state, avg_carpool, avg_state_rank) %>%
  top_n(1)

carpool_df_by_state
## # A tibble: 1 × 3
##   state                avg_carpool avg_state_rank
##   <chr>                      <dbl>          <int>
## 1 District of Columbia         5.7             51
# carpool_df_by_state[avg_state_rank == 1] why doesn't this work?
# arrange function is still not arranging correctly

e. What are the top 5 states for carpooling?

carpool_df_by_state_top5 <- data %>%
  group_by(state) %>%
  summarise(avg_carpool = mean(carpool, na.rm = TRUE)) %>%
  mutate(avg_state_rank = min_rank(desc(avg_carpool))) %>%
  arrange(avg_state_rank) %>%
  select(state, avg_carpool, avg_state_rank) %>%
  top_n(5)

carpool_df_by_state_top5
## # A tibble: 5 × 3
##   state                avg_carpool avg_state_rank
##   <chr>                      <dbl>          <int>
## 1 New Jersey                  8.10             47
## 2 Rhode Island                8                48
## 3 Connecticut                 7.94             49
## 4 Massachusetts               7.75             50
## 5 District of Columbia        5.7              51