#Using the library tidyverse
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.4 v dplyr 1.0.7
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 2.0.1 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
#reading the csv file
county_data <- read_csv("acs_2015_county_data_revised.csv")
## Rows: 3142 Columns: 35
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (2): state, county
## dbl (33): census_id, total_pop, men, women, hispanic, white, black, native, ...
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
#converting the data frame into tibbles
as_tibble(county_data)
## # A tibble: 3,142 x 35
## census_id state county total_pop men women hispanic white black native
## <dbl> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1001 Alabama Autauga 55221 26745 28476 2.6 75.8 18.5 0.4
## 2 1003 Alabama Baldwin 195121 95314 99807 4.5 83.1 9.5 0.6
## 3 1005 Alabama Barbour 26932 14497 12435 4.6 46.2 46.7 0.2
## 4 1007 Alabama Bibb 22604 12073 10531 2.2 74.5 21.4 0.4
## 5 1009 Alabama Blount 57710 28512 29198 8.6 87.9 1.5 0.3
## 6 1011 Alabama Bullock 10678 5660 5018 4.4 22.2 70.7 1.2
## 7 1013 Alabama Butler 20354 9502 10852 1.2 53.3 43.8 0.1
## 8 1015 Alabama Calhoun 116648 56274 60374 3.5 73 20.3 0.2
## 9 1017 Alabama Chambers 34079 16258 17821 0.4 57.3 40.3 0.2
## 10 1019 Alabama Cherokee 26008 12975 13033 1.5 91.7 4.8 0.6
## # ... with 3,132 more rows, and 25 more variables: asian <dbl>, pacific <dbl>,
## # citizen <dbl>, income <dbl>, income_per_cap <dbl>, poverty <dbl>,
## # child_poverty <dbl>, professional <dbl>, service <dbl>, office <dbl>,
## # construction <dbl>, production <dbl>, drive <dbl>, carpool <dbl>,
## # transit <dbl>, walk <dbl>, other_transp <dbl>, work_at_home <dbl>,
## # mean_commute <dbl>, employed <dbl>, private_work <dbl>, public_work <dbl>,
## # self_employed <dbl>, family_work <dbl>, unemployment <dbl>
#Displaying the number of records in the tibble
dim(county_data)
## [1] 3142 35
There are 3142 rows and 35 columns in acs_2015_county_data_revised.csv data set.
#examining the structure of the data set imported
str(county_data)
## spec_tbl_df [3,142 x 35] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ census_id : num [1:3142] 1001 1003 1005 1007 1009 ...
## $ state : chr [1:3142] "Alabama" "Alabama" "Alabama" "Alabama" ...
## $ county : chr [1:3142] "Autauga" "Baldwin" "Barbour" "Bibb" ...
## $ total_pop : num [1:3142] 55221 195121 26932 22604 57710 ...
## $ men : num [1:3142] 26745 95314 14497 12073 28512 ...
## $ women : num [1:3142] 28476 99807 12435 10531 29198 ...
## $ hispanic : num [1:3142] 2.6 4.5 4.6 2.2 8.6 4.4 1.2 3.5 0.4 1.5 ...
## $ white : num [1:3142] 75.8 83.1 46.2 74.5 87.9 22.2 53.3 73 57.3 91.7 ...
## $ black : num [1:3142] 18.5 9.5 46.7 21.4 1.5 70.7 43.8 20.3 40.3 4.8 ...
## $ native : num [1:3142] 0.4 0.6 0.2 0.4 0.3 1.2 0.1 0.2 0.2 0.6 ...
## $ asian : num [1:3142] 1 0.7 0.4 0.1 0.1 0.2 0.4 0.9 0.8 0.3 ...
## $ pacific : num [1:3142] 0 0 0 0 0 0 0 0 0 0 ...
## $ citizen : num [1:3142] 40725 147695 20714 17495 42345 ...
## $ income : num [1:3142] 51281 50254 32964 38678 45813 ...
## $ income_per_cap: num [1:3142] 24974 27317 16824 18431 20532 ...
## $ poverty : num [1:3142] 12.9 13.4 26.7 16.8 16.7 24.6 25.4 20.5 21.6 19.2 ...
## $ child_poverty : num [1:3142] 18.6 19.2 45.3 27.9 27.2 38.4 39.2 31.6 37.2 30.1 ...
## $ professional : num [1:3142] 33.2 33.1 26.8 21.5 28.5 18.8 27.5 27.3 23.3 29.3 ...
## $ service : num [1:3142] 17 17.7 16.1 17.9 14.1 15 16.6 17.7 14.5 16 ...
## $ office : num [1:3142] 24.2 27.1 23.1 17.8 23.9 19.7 21.9 24.2 26.3 19.5 ...
## $ construction : num [1:3142] 8.6 10.8 10.8 19 13.5 20.1 10.3 10.5 11.5 13.7 ...
## $ production : num [1:3142] 17.1 11.2 23.1 23.7 19.9 26.4 23.7 20.4 24.4 21.5 ...
## $ drive : num [1:3142] 87.5 84.7 83.8 83.2 84.9 74.9 84.5 85.3 85.1 83.9 ...
## $ carpool : num [1:3142] 8.8 8.8 10.9 13.5 11.2 14.9 12.4 9.4 11.9 12.1 ...
## $ transit : num [1:3142] 0.1 0.1 0.4 0.5 0.4 0.7 0 0.2 0.2 0.2 ...
## $ walk : num [1:3142] 0.5 1 1.8 0.6 0.9 5 0.8 1.2 0.3 0.6 ...
## $ other_transp : num [1:3142] 1.3 1.4 1.5 1.5 0.4 1.7 0.6 1.2 0.4 0.7 ...
## $ work_at_home : num [1:3142] 1.8 3.9 1.6 0.7 2.3 2.8 1.7 2.7 2.1 2.5 ...
## $ mean_commute : num [1:3142] 26.5 26.4 24.1 28.8 34.9 27.5 24.6 24.1 25.1 27.4 ...
## $ employed : num [1:3142] 23986 85953 8597 8294 22189 ...
## $ private_work : num [1:3142] 73.6 81.5 71.8 76.8 82 79.5 77.4 74.1 85.1 73.1 ...
## $ public_work : num [1:3142] 20.9 12.3 20.8 16.1 13.5 15.1 16.2 20.8 12.1 18.5 ...
## $ self_employed : num [1:3142] 5.5 5.8 7.3 6.7 4.2 5.4 6.2 5 2.8 7.9 ...
## $ family_work : num [1:3142] 0 0.4 0.1 0.4 0.4 0 0.2 0.1 0 0.5 ...
## $ unemployment : num [1:3142] 7.6 7.5 17.6 8.3 7.7 18 10.9 12.3 8.9 7.9 ...
## - attr(*, "spec")=
## .. cols(
## .. census_id = col_double(),
## .. state = col_character(),
## .. county = col_character(),
## .. total_pop = col_double(),
## .. men = col_double(),
## .. women = col_double(),
## .. hispanic = col_double(),
## .. white = col_double(),
## .. black = col_double(),
## .. native = col_double(),
## .. asian = col_double(),
## .. pacific = col_double(),
## .. citizen = col_double(),
## .. income = col_double(),
## .. income_per_cap = col_double(),
## .. poverty = col_double(),
## .. child_poverty = col_double(),
## .. professional = col_double(),
## .. service = col_double(),
## .. office = col_double(),
## .. construction = col_double(),
## .. production = col_double(),
## .. drive = col_double(),
## .. carpool = col_double(),
## .. transit = col_double(),
## .. walk = col_double(),
## .. other_transp = col_double(),
## .. work_at_home = col_double(),
## .. mean_commute = col_double(),
## .. employed = col_double(),
## .. private_work = col_double(),
## .. public_work = col_double(),
## .. self_employed = col_double(),
## .. family_work = col_double(),
## .. unemployment = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
The census_id is numeric in the data set, changing it to character.
#changing the census_id from num to chr
county_data$census_id <- as.character(county_data$census_id)
#Using glimpse() to see the changes
glimpse(county_data)
## Rows: 3,142
## Columns: 35
## $ census_id <chr> "1001", "1003", "1005", "1007", "1009", "1011", "1013",~
## $ state <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama", ~
## $ county <chr> "Autauga", "Baldwin", "Barbour", "Bibb", "Blount", "Bul~
## $ total_pop <dbl> 55221, 195121, 26932, 22604, 57710, 10678, 20354, 11664~
## $ men <dbl> 26745, 95314, 14497, 12073, 28512, 5660, 9502, 56274, 1~
## $ women <dbl> 28476, 99807, 12435, 10531, 29198, 5018, 10852, 60374, ~
## $ hispanic <dbl> 2.6, 4.5, 4.6, 2.2, 8.6, 4.4, 1.2, 3.5, 0.4, 1.5, 7.6, ~
## $ white <dbl> 75.8, 83.1, 46.2, 74.5, 87.9, 22.2, 53.3, 73.0, 57.3, 9~
## $ black <dbl> 18.5, 9.5, 46.7, 21.4, 1.5, 70.7, 43.8, 20.3, 40.3, 4.8~
## $ native <dbl> 0.4, 0.6, 0.2, 0.4, 0.3, 1.2, 0.1, 0.2, 0.2, 0.6, 0.4, ~
## $ asian <dbl> 1.0, 0.7, 0.4, 0.1, 0.1, 0.2, 0.4, 0.9, 0.8, 0.3, 0.3, ~
## $ pacific <dbl> 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ~
## $ citizen <dbl> 40725, 147695, 20714, 17495, 42345, 8057, 15581, 88612,~
## $ income <dbl> 51281, 50254, 32964, 38678, 45813, 31938, 32229, 41703,~
## $ income_per_cap <dbl> 24974, 27317, 16824, 18431, 20532, 17580, 18390, 21374,~
## $ poverty <dbl> 12.9, 13.4, 26.7, 16.8, 16.7, 24.6, 25.4, 20.5, 21.6, 1~
## $ child_poverty <dbl> 18.6, 19.2, 45.3, 27.9, 27.2, 38.4, 39.2, 31.6, 37.2, 3~
## $ professional <dbl> 33.2, 33.1, 26.8, 21.5, 28.5, 18.8, 27.5, 27.3, 23.3, 2~
## $ service <dbl> 17.0, 17.7, 16.1, 17.9, 14.1, 15.0, 16.6, 17.7, 14.5, 1~
## $ office <dbl> 24.2, 27.1, 23.1, 17.8, 23.9, 19.7, 21.9, 24.2, 26.3, 1~
## $ construction <dbl> 8.6, 10.8, 10.8, 19.0, 13.5, 20.1, 10.3, 10.5, 11.5, 13~
## $ production <dbl> 17.1, 11.2, 23.1, 23.7, 19.9, 26.4, 23.7, 20.4, 24.4, 2~
## $ drive <dbl> 87.5, 84.7, 83.8, 83.2, 84.9, 74.9, 84.5, 85.3, 85.1, 8~
## $ carpool <dbl> 8.8, 8.8, 10.9, 13.5, 11.2, 14.9, 12.4, 9.4, 11.9, 12.1~
## $ transit <dbl> 0.1, 0.1, 0.4, 0.5, 0.4, 0.7, 0.0, 0.2, 0.2, 0.2, 0.2, ~
## $ walk <dbl> 0.5, 1.0, 1.8, 0.6, 0.9, 5.0, 0.8, 1.2, 0.3, 0.6, 1.1, ~
## $ other_transp <dbl> 1.3, 1.4, 1.5, 1.5, 0.4, 1.7, 0.6, 1.2, 0.4, 0.7, 1.4, ~
## $ work_at_home <dbl> 1.8, 3.9, 1.6, 0.7, 2.3, 2.8, 1.7, 2.7, 2.1, 2.5, 1.9, ~
## $ mean_commute <dbl> 26.5, 26.4, 24.1, 28.8, 34.9, 27.5, 24.6, 24.1, 25.1, 2~
## $ employed <dbl> 23986, 85953, 8597, 8294, 22189, 3865, 7813, 47401, 136~
## $ private_work <dbl> 73.6, 81.5, 71.8, 76.8, 82.0, 79.5, 77.4, 74.1, 85.1, 7~
## $ public_work <dbl> 20.9, 12.3, 20.8, 16.1, 13.5, 15.1, 16.2, 20.8, 12.1, 1~
## $ self_employed <dbl> 5.5, 5.8, 7.3, 6.7, 4.2, 5.4, 6.2, 5.0, 2.8, 7.9, 4.1, ~
## $ family_work <dbl> 0.0, 0.4, 0.1, 0.4, 0.4, 0.0, 0.2, 0.1, 0.0, 0.5, 0.5, ~
## $ unemployment <dbl> 7.6, 7.5, 17.6, 8.3, 7.7, 18.0, 10.9, 12.3, 8.9, 7.9, 9~
#checking for missing values in the data set imported
colSums(is.na(county_data))
## census_id state county total_pop men
## 0 0 0 0 0
## women hispanic white black native
## 0 0 0 0 0
## asian pacific citizen income income_per_cap
## 0 0 0 1 0
## poverty child_poverty professional service office
## 0 1 0 0 0
## construction production drive carpool transit
## 0 0 0 0 0
## walk other_transp work_at_home mean_commute employed
## 0 0 0 0 0
## private_work public_work self_employed family_work unemployment
## 0 0 0 0 0
Variables income and child_poverty includes 1 missing value each.
Removing the entire obervations due to a missing value in income or child_poverty does not seem appropriate to me because these observations include other valuable demographic information.
I would proceed with imputing the values. Plotting the below graphs to observe the skewness of the variable income and child_poverty.
#plotting histogram for income
county_data %>% ggplot(mapping = aes(x=income)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 1 rows containing non-finite values (stat_bin).
#plotting histogram for child_poverty
county_data %>% ggplot(mapping = aes(x=child_poverty)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 1 rows containing non-finite values (stat_bin).
From the above two graphs I came to conclusion of imputing missing values with median values, since income and child_poverty are right skewed.
#imputing the missing values with median for income
county_data$income[is.na(county_data$income)] <- median(county_data$income, na.rm=TRUE)
#imputing the missing values with median for child_poverty
county_data$child_poverty[is.na(county_data$child_poverty)] <- median(county_data$child_poverty, na.rm=TRUE)
#checking for missing values
colSums(is.na(county_data))
## census_id state county total_pop men
## 0 0 0 0 0
## women hispanic white black native
## 0 0 0 0 0
## asian pacific citizen income income_per_cap
## 0 0 0 0 0
## poverty child_poverty professional service office
## 0 0 0 0 0
## construction production drive carpool transit
## 0 0 0 0 0
## walk other_transp work_at_home mean_commute employed
## 0 0 0 0 0
## private_work public_work self_employed family_work unemployment
## 0 0 0 0 0
The data set now has 0 missing values.
summary(county_data)
## census_id state county total_pop
## Length:3142 Length:3142 Length:3142 Min. : 85
## Class :character Class :character Class :character 1st Qu.: 11028
## Mode :character Mode :character Mode :character Median : 25768
## Mean : 100737
## 3rd Qu.: 67552
## Max. :10038388
## men women hispanic white
## Min. : 42 Min. : 43 Min. : 0.000 Min. : 0.90
## 1st Qu.: 5546 1st Qu.: 5466 1st Qu.: 1.900 1st Qu.:65.60
## Median : 12826 Median : 12907 Median : 3.700 Median :84.60
## Mean : 49565 Mean : 51171 Mean : 8.826 Mean :77.28
## 3rd Qu.: 33319 3rd Qu.: 34122 3rd Qu.: 9.000 3rd Qu.:93.30
## Max. :4945351 Max. :5093037 Max. :98.700 Max. :99.80
## black native asian pacific
## Min. : 0.000 Min. : 0.000 Min. : 0.000 Min. : 0.00000
## 1st Qu.: 0.600 1st Qu.: 0.100 1st Qu.: 0.200 1st Qu.: 0.00000
## Median : 2.100 Median : 0.300 Median : 0.500 Median : 0.00000
## Mean : 8.879 Mean : 1.766 Mean : 1.258 Mean : 0.08475
## 3rd Qu.:10.175 3rd Qu.: 0.600 3rd Qu.: 1.200 3rd Qu.: 0.00000
## Max. :85.900 Max. :92.100 Max. :41.600 Max. :35.30000
## citizen income income_per_cap poverty
## Min. : 80 Min. : 19328 Min. : 8292 Min. : 1.4
## 1st Qu.: 8254 1st Qu.: 38826 1st Qu.:20471 1st Qu.:12.0
## Median : 19434 Median : 45111 Median :23577 Median :16.0
## Mean : 70804 Mean : 46830 Mean :24338 Mean :16.7
## 3rd Qu.: 50728 3rd Qu.: 52249 3rd Qu.:27138 3rd Qu.:20.3
## Max. :6046749 Max. :123453 Max. :65600 Max. :53.3
## child_poverty professional service office
## Min. : 0.00 Min. :13.50 Min. : 5.00 Min. : 4.10
## 1st Qu.:16.10 1st Qu.:26.70 1st Qu.:15.90 1st Qu.:20.20
## Median :22.50 Median :30.00 Median :18.00 Median :22.40
## Mean :23.29 Mean :31.04 Mean :18.26 Mean :22.13
## 3rd Qu.:29.48 3rd Qu.:34.40 3rd Qu.:20.20 3rd Qu.:24.30
## Max. :72.30 Max. :74.00 Max. :36.60 Max. :35.40
## construction production drive carpool
## Min. : 1.70 Min. : 0.00 Min. : 5.20 Min. : 0.00
## 1st Qu.: 9.80 1st Qu.:11.53 1st Qu.:76.60 1st Qu.: 8.50
## Median :12.20 Median :15.40 Median :80.60 Median : 9.90
## Mean :12.74 Mean :15.82 Mean :79.08 Mean :10.33
## 3rd Qu.:15.00 3rd Qu.:19.40 3rd Qu.:83.60 3rd Qu.:11.88
## Max. :40.30 Max. :55.60 Max. :94.60 Max. :29.90
## transit walk other_transp work_at_home
## Min. : 0.0000 Min. : 0.000 Min. : 0.000 Min. : 0.000
## 1st Qu.: 0.1000 1st Qu.: 1.400 1st Qu.: 0.900 1st Qu.: 2.800
## Median : 0.4000 Median : 2.400 Median : 1.300 Median : 4.000
## Mean : 0.9675 Mean : 3.307 Mean : 1.614 Mean : 4.697
## 3rd Qu.: 0.8000 3rd Qu.: 4.000 3rd Qu.: 1.900 3rd Qu.: 5.700
## Max. :61.7000 Max. :71.200 Max. :39.100 Max. :37.200
## mean_commute employed private_work public_work
## Min. : 4.90 Min. : 62 Min. :25.00 Min. : 5.80
## 1st Qu.:19.30 1st Qu.: 4524 1st Qu.:70.90 1st Qu.:13.10
## Median :22.90 Median : 10644 Median :75.80 Median :16.10
## Mean :23.15 Mean : 46387 Mean :74.44 Mean :17.35
## 3rd Qu.:26.60 3rd Qu.: 29254 3rd Qu.:79.80 3rd Qu.:20.10
## Max. :44.00 Max. :4635465 Max. :88.30 Max. :66.20
## self_employed family_work unemployment
## Min. : 0.000 Min. :0.0000 Min. : 0.000
## 1st Qu.: 5.400 1st Qu.:0.1000 1st Qu.: 5.500
## Median : 6.900 Median :0.2000 Median : 7.500
## Mean : 7.921 Mean :0.2915 Mean : 7.815
## 3rd Qu.: 9.400 3rd Qu.:0.3000 3rd Qu.: 9.700
## Max. :36.600 Max. :9.8000 Max. :29.400
Unusual field is employed since it does not seem to match its data dictionary definition of percentage employed, ages 16+. It is a number ranging from 62 to 4635465, with a mean of 46416. I will create a new column with the percentage formula.
#adding new column to the data
county_data <- county_data %>% mutate(employed_percent = employed/total_pop*100)
#checking for new column added to the data set
glimpse(county_data)
## Rows: 3,142
## Columns: 36
## $ census_id <chr> "1001", "1003", "1005", "1007", "1009", "1011", "1013~
## $ state <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama"~
## $ county <chr> "Autauga", "Baldwin", "Barbour", "Bibb", "Blount", "B~
## $ total_pop <dbl> 55221, 195121, 26932, 22604, 57710, 10678, 20354, 116~
## $ men <dbl> 26745, 95314, 14497, 12073, 28512, 5660, 9502, 56274,~
## $ women <dbl> 28476, 99807, 12435, 10531, 29198, 5018, 10852, 60374~
## $ hispanic <dbl> 2.6, 4.5, 4.6, 2.2, 8.6, 4.4, 1.2, 3.5, 0.4, 1.5, 7.6~
## $ white <dbl> 75.8, 83.1, 46.2, 74.5, 87.9, 22.2, 53.3, 73.0, 57.3,~
## $ black <dbl> 18.5, 9.5, 46.7, 21.4, 1.5, 70.7, 43.8, 20.3, 40.3, 4~
## $ native <dbl> 0.4, 0.6, 0.2, 0.4, 0.3, 1.2, 0.1, 0.2, 0.2, 0.6, 0.4~
## $ asian <dbl> 1.0, 0.7, 0.4, 0.1, 0.1, 0.2, 0.4, 0.9, 0.8, 0.3, 0.3~
## $ pacific <dbl> 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0~
## $ citizen <dbl> 40725, 147695, 20714, 17495, 42345, 8057, 15581, 8861~
## $ income <dbl> 51281, 50254, 32964, 38678, 45813, 31938, 32229, 4170~
## $ income_per_cap <dbl> 24974, 27317, 16824, 18431, 20532, 17580, 18390, 2137~
## $ poverty <dbl> 12.9, 13.4, 26.7, 16.8, 16.7, 24.6, 25.4, 20.5, 21.6,~
## $ child_poverty <dbl> 18.6, 19.2, 45.3, 27.9, 27.2, 38.4, 39.2, 31.6, 37.2,~
## $ professional <dbl> 33.2, 33.1, 26.8, 21.5, 28.5, 18.8, 27.5, 27.3, 23.3,~
## $ service <dbl> 17.0, 17.7, 16.1, 17.9, 14.1, 15.0, 16.6, 17.7, 14.5,~
## $ office <dbl> 24.2, 27.1, 23.1, 17.8, 23.9, 19.7, 21.9, 24.2, 26.3,~
## $ construction <dbl> 8.6, 10.8, 10.8, 19.0, 13.5, 20.1, 10.3, 10.5, 11.5, ~
## $ production <dbl> 17.1, 11.2, 23.1, 23.7, 19.9, 26.4, 23.7, 20.4, 24.4,~
## $ drive <dbl> 87.5, 84.7, 83.8, 83.2, 84.9, 74.9, 84.5, 85.3, 85.1,~
## $ carpool <dbl> 8.8, 8.8, 10.9, 13.5, 11.2, 14.9, 12.4, 9.4, 11.9, 12~
## $ transit <dbl> 0.1, 0.1, 0.4, 0.5, 0.4, 0.7, 0.0, 0.2, 0.2, 0.2, 0.2~
## $ walk <dbl> 0.5, 1.0, 1.8, 0.6, 0.9, 5.0, 0.8, 1.2, 0.3, 0.6, 1.1~
## $ other_transp <dbl> 1.3, 1.4, 1.5, 1.5, 0.4, 1.7, 0.6, 1.2, 0.4, 0.7, 1.4~
## $ work_at_home <dbl> 1.8, 3.9, 1.6, 0.7, 2.3, 2.8, 1.7, 2.7, 2.1, 2.5, 1.9~
## $ mean_commute <dbl> 26.5, 26.4, 24.1, 28.8, 34.9, 27.5, 24.6, 24.1, 25.1,~
## $ employed <dbl> 23986, 85953, 8597, 8294, 22189, 3865, 7813, 47401, 1~
## $ private_work <dbl> 73.6, 81.5, 71.8, 76.8, 82.0, 79.5, 77.4, 74.1, 85.1,~
## $ public_work <dbl> 20.9, 12.3, 20.8, 16.1, 13.5, 15.1, 16.2, 20.8, 12.1,~
## $ self_employed <dbl> 5.5, 5.8, 7.3, 6.7, 4.2, 5.4, 6.2, 5.0, 2.8, 7.9, 4.1~
## $ family_work <dbl> 0.0, 0.4, 0.1, 0.4, 0.4, 0.0, 0.2, 0.1, 0.0, 0.5, 0.5~
## $ unemployment <dbl> 7.6, 7.5, 17.6, 8.3, 7.7, 18.0, 10.9, 12.3, 8.9, 7.9,~
## $ employed_percent <dbl> 43.43637, 44.05113, 31.92113, 36.69262, 38.44914, 36.~
#checking for counties having more women than men
county_data %>%
filter(women > men) %>%
nrow()
## [1] 1985
There are 1985 counties having more women than men.
#checking for counties have an unemployment rate lower than 10%
county_data %>%
filter(unemployment < 10.0) %>%
nrow()
## [1] 2420
There are 2,420 counties with unemployment rate lower than 10%.
#checking for top 10 counties with the highest mean commute
county_data %>%
arrange(desc(mean_commute)) %>%
select(census_id, county, state, mean_commute) %>%
top_n(n=10, wt=mean_commute)
## # A tibble: 10 x 4
## census_id county state mean_commute
## <chr> <chr> <chr> <dbl>
## 1 42103 Pike Pennsylvania 44
## 2 36005 Bronx New York 43
## 3 24017 Charles Maryland 42.8
## 4 51187 Warren Virginia 42.7
## 5 36081 Queens New York 42.6
## 6 36085 Richmond New York 42.6
## 7 51193 Westmoreland Virginia 42.5
## 8 8093 Park Colorado 42.4
## 9 36047 Kings New York 41.7
## 10 54015 Clay West Virginia 41.4
Pike in Pennsylvania is the county with the highest mean commmute, followed by Bronx, Charles, Warren, Queens, Richmond, Westmoreland, Park, Kings, and Clay.
#Creating a new variable that calculates the percentage of women for each county and then find the top 10 counties with the lowest percentages
county_data %>% mutate(women_percent = women/total_pop*100) %>%
arrange(women_percent) %>%
select(census_id, county, state, women_percent) %>%
top_n(n = -10, wt = women_percent)
## # A tibble: 10 x 4
## census_id county state women_percent
## <chr> <chr> <chr> <dbl>
## 1 42053 Forest Pennsylvania 26.8
## 2 8011 Bent Colorado 31.4
## 3 51183 Sussex Virginia 31.5
## 4 13309 Wheeler Georgia 32.1
## 5 6035 Lassen California 33.2
## 6 48095 Concho Texas 33.3
## 7 13053 Chattahoochee Georgia 33.4
## 8 2013 Aleutians East Borough Alaska 33.5
## 9 22125 West Feliciana Louisiana 33.6
## 10 32027 Pershing Nevada 33.7
The new variable women_percent is created to find the percentage of women for each county. Forest in Pennsylvania is the county with the lowest percentage of women, followed by Bent, Sussex, Wheeler, Lassen, Concho, Chattahoochee, Aleutians East Borough, West Feliciana, and Pershing.
#creating a new variable to calculate the sum
county_data <- mutate(county_data, total_race_percent = hispanic + white + black + native + asian + pacific)
#checking for top 10 counties with the lowest total_race_percent
county_data %>%
select(total_race_percent, everything()) %>%
arrange(total_race_percent) %>%
top_n(n=-10, wt=total_race_percent)
## # A tibble: 10 x 37
## total_race_percent census_id state county total_pop men women hispanic
## <dbl> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 76.4 15001 Hawaii Hawaii 191482 95939 95543 12.2
## 2 79.2 15009 Hawaii Maui 160863 80790 80073 10.8
## 3 79.7 40097 Oklahoma Mayes 41007 20370 20637 3.2
## 4 81.5 15003 Hawaii Honol~ 984178 498129 486049 9.2
## 5 82.8 40123 Oklahoma Ponto~ 38055 18595 19460 4.7
## 6 83 47061 Tennessee Grundy 13524 6687 6837 0.5
## 7 83.4 2282 Alaska Yakut~ 643 367 276 2.6
## 8 84 40069 Oklahoma Johns~ 11022 5406 5616 4.5
## 9 84.1 15007 Hawaii Kauai 69691 34971 34720 10.5
## 10 85.1 40003 Oklahoma Alfal~ 5755 3454 2301 5
## # ... with 29 more variables: white <dbl>, black <dbl>, native <dbl>,
## # asian <dbl>, pacific <dbl>, citizen <dbl>, income <dbl>,
## # income_per_cap <dbl>, poverty <dbl>, child_poverty <dbl>,
## # professional <dbl>, service <dbl>, office <dbl>, construction <dbl>,
## # production <dbl>, drive <dbl>, carpool <dbl>, transit <dbl>, walk <dbl>,
## # other_transp <dbl>, work_at_home <dbl>, mean_commute <dbl>, employed <dbl>,
## # private_work <dbl>, public_work <dbl>, self_employed <dbl>, ...
Hawaii is the county with the lowest sum of race percentage, followed by Maui, Mayes, Honolulu, Pontotoc, Grundy, Yakutat City and Borough, Johnston, Kauai, and Alfalfa.
#checking for state on avg having the lowest total_race_percent
county_data %>% select(total_race_percent, everything()) %>%
group_by(state) %>%
summarize(avg_sum_race = mean(total_race_percent)) %>%
arrange(avg_sum_race) %>%
top_n(n = -1, wt = avg_sum_race)
## # A tibble: 1 x 2
## state avg_sum_race
## <chr> <dbl>
## 1 Hawaii 84
Hawaii, on average, have the lowest sum of race percentage of 84%.
#checking for counties having a sum greater than 100%
county_data %>% mutate(percent_gt100 = total_race_percent > 100) %>%
filter(percent_gt100 == TRUE) %>%
select(census_id, county, state, total_race_percent, percent_gt100)
## # A tibble: 11 x 5
## census_id county state total_race_percent percent_gt100
## <chr> <chr> <chr> <dbl> <lgl>
## 1 28021 Claiborne Mississippi 100 TRUE
## 2 31073 Gosper Nebraska 100. TRUE
## 3 31091 Hooker Nebraska 100. TRUE
## 4 31125 Nance Nebraska 100. TRUE
## 5 48017 Bailey Texas 100. TRUE
## 6 48131 Duval Texas 100 TRUE
## 7 48137 Edwards Texas 100. TRUE
## 8 48261 Kenedy Texas 100 TRUE
## 9 48263 Kent Texas 100 TRUE
## 10 48377 Presidio Texas 100 TRUE
## 11 49001 Beaver Utah 100 TRUE
There are 11 counties having a sum greater than 100% - Claiborne, Gosper, Hooker, Nance, Bailey, Duval, Edwards, Kenedy, Kent, Presidio, and Beaver.
#checking for states having a sum that equals exactly to 100%
county_data %>% mutate(percent_100 = (total_race_percent == 100)) %>%
filter(percent_100 == TRUE) %>%
select(census_id, state, total_race_percent, percent_100) %>%
distinct(state)
## # A tibble: 13 x 1
## state
## <chr>
## 1 Alabama
## 2 Georgia
## 3 Kansas
## 4 Kentucky
## 5 Mississippi
## 6 Montana
## 7 Nebraska
## 8 New Mexico
## 9 North Carolina
## 10 North Dakota
## 11 South Dakota
## 12 Texas
## 13 West Virginia
There are 13 states having a sum that equals exactly to 100%.
#creating variable carpool_rank
county_data$carpool_rank <- min_rank(desc(county_data$carpool))
#checking for newly created variable
glimpse(county_data)
## Rows: 3,142
## Columns: 38
## $ census_id <chr> "1001", "1003", "1005", "1007", "1009", "1011", "10~
## $ state <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabam~
## $ county <chr> "Autauga", "Baldwin", "Barbour", "Bibb", "Blount", ~
## $ total_pop <dbl> 55221, 195121, 26932, 22604, 57710, 10678, 20354, 1~
## $ men <dbl> 26745, 95314, 14497, 12073, 28512, 5660, 9502, 5627~
## $ women <dbl> 28476, 99807, 12435, 10531, 29198, 5018, 10852, 603~
## $ hispanic <dbl> 2.6, 4.5, 4.6, 2.2, 8.6, 4.4, 1.2, 3.5, 0.4, 1.5, 7~
## $ white <dbl> 75.8, 83.1, 46.2, 74.5, 87.9, 22.2, 53.3, 73.0, 57.~
## $ black <dbl> 18.5, 9.5, 46.7, 21.4, 1.5, 70.7, 43.8, 20.3, 40.3,~
## $ native <dbl> 0.4, 0.6, 0.2, 0.4, 0.3, 1.2, 0.1, 0.2, 0.2, 0.6, 0~
## $ asian <dbl> 1.0, 0.7, 0.4, 0.1, 0.1, 0.2, 0.4, 0.9, 0.8, 0.3, 0~
## $ pacific <dbl> 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0~
## $ citizen <dbl> 40725, 147695, 20714, 17495, 42345, 8057, 15581, 88~
## $ income <dbl> 51281, 50254, 32964, 38678, 45813, 31938, 32229, 41~
## $ income_per_cap <dbl> 24974, 27317, 16824, 18431, 20532, 17580, 18390, 21~
## $ poverty <dbl> 12.9, 13.4, 26.7, 16.8, 16.7, 24.6, 25.4, 20.5, 21.~
## $ child_poverty <dbl> 18.6, 19.2, 45.3, 27.9, 27.2, 38.4, 39.2, 31.6, 37.~
## $ professional <dbl> 33.2, 33.1, 26.8, 21.5, 28.5, 18.8, 27.5, 27.3, 23.~
## $ service <dbl> 17.0, 17.7, 16.1, 17.9, 14.1, 15.0, 16.6, 17.7, 14.~
## $ office <dbl> 24.2, 27.1, 23.1, 17.8, 23.9, 19.7, 21.9, 24.2, 26.~
## $ construction <dbl> 8.6, 10.8, 10.8, 19.0, 13.5, 20.1, 10.3, 10.5, 11.5~
## $ production <dbl> 17.1, 11.2, 23.1, 23.7, 19.9, 26.4, 23.7, 20.4, 24.~
## $ drive <dbl> 87.5, 84.7, 83.8, 83.2, 84.9, 74.9, 84.5, 85.3, 85.~
## $ carpool <dbl> 8.8, 8.8, 10.9, 13.5, 11.2, 14.9, 12.4, 9.4, 11.9, ~
## $ transit <dbl> 0.1, 0.1, 0.4, 0.5, 0.4, 0.7, 0.0, 0.2, 0.2, 0.2, 0~
## $ walk <dbl> 0.5, 1.0, 1.8, 0.6, 0.9, 5.0, 0.8, 1.2, 0.3, 0.6, 1~
## $ other_transp <dbl> 1.3, 1.4, 1.5, 1.5, 0.4, 1.7, 0.6, 1.2, 0.4, 0.7, 1~
## $ work_at_home <dbl> 1.8, 3.9, 1.6, 0.7, 2.3, 2.8, 1.7, 2.7, 2.1, 2.5, 1~
## $ mean_commute <dbl> 26.5, 26.4, 24.1, 28.8, 34.9, 27.5, 24.6, 24.1, 25.~
## $ employed <dbl> 23986, 85953, 8597, 8294, 22189, 3865, 7813, 47401,~
## $ private_work <dbl> 73.6, 81.5, 71.8, 76.8, 82.0, 79.5, 77.4, 74.1, 85.~
## $ public_work <dbl> 20.9, 12.3, 20.8, 16.1, 13.5, 15.1, 16.2, 20.8, 12.~
## $ self_employed <dbl> 5.5, 5.8, 7.3, 6.7, 4.2, 5.4, 6.2, 5.0, 2.8, 7.9, 4~
## $ family_work <dbl> 0.0, 0.4, 0.1, 0.4, 0.4, 0.0, 0.2, 0.1, 0.0, 0.5, 0~
## $ unemployment <dbl> 7.6, 7.5, 17.6, 8.3, 7.7, 18.0, 10.9, 12.3, 8.9, 7.~
## $ employed_percent <dbl> 43.43637, 44.05113, 31.92113, 36.69262, 38.44914, 3~
## $ total_race_percent <dbl> 98.3, 98.4, 98.1, 98.6, 98.4, 98.7, 98.8, 97.9, 99.~
## $ carpool_rank <int> 2157, 2157, 1103, 391, 986, 204, 621, 1811, 759, 69~
#finding the 10 highest ranked counties for carpooling
county_data %>%
select(census_id, county, state, carpool, carpool_rank) %>%
arrange(carpool_rank) %>%
top_n(-10, wt = carpool_rank)
## # A tibble: 10 x 5
## census_id county state carpool carpool_rank
## <chr> <chr> <chr> <dbl> <int>
## 1 13061 Clay Georgia 29.9 1
## 2 18087 LaGrange Indiana 27 2
## 3 13165 Jenkins Georgia 25.3 3
## 4 5133 Sevier Arkansas 24.4 4
## 5 20175 Seward Kansas 23.4 5
## 6 48079 Cochran Texas 22.8 6
## 7 48247 Jim Hogg Texas 22.6 7
## 8 48393 Roberts Texas 22.4 8
## 9 39075 Holmes Ohio 21.8 9
## 10 21197 Powell Kentucky 21.6 10
The 10 highest ranked counties for carpooling are Clay, LaGrange, Jenkins, Sevier, Seward, Cochran, Jim Hogg, Roberts, Holmes, and Powell.
#Finding the 10 lowest ranked counties for carpooling
county_data %>% arrange(desc(carpool_rank)) %>%
select(census_id , county, state, carpool, carpool_rank) %>%
top_n(n=10, wt=carpool_rank)
## # A tibble: 11 x 5
## census_id county state carpool carpool_rank
## <chr> <chr> <chr> <dbl> <int>
## 1 48261 Kenedy Texas 0 3141
## 2 48269 King Texas 0 3141
## 3 48235 Irion Texas 0.9 3140
## 4 31183 Wheeler Nebraska 1.3 3139
## 5 36061 New York New York 1.9 3138
## 6 13309 Wheeler Georgia 2.3 3136
## 7 38029 Emmons North Dakota 2.3 3136
## 8 30019 Daniels Montana 2.6 3134
## 9 31057 Dundy Nebraska 2.6 3134
## 10 46069 Hyde South Dakota 2.8 3132
## 11 51720 Norton city Virginia 2.8 3132
The 10 lowest ranked counties for carpooling are Kenedy, King, Irion, Wheeler (Nebraska), New York, Wheeler (Georgia), Emmons, Daniels, Dundy, Hyde, and Norton city
#checking for state ranked for best carpooling
county_data %>% group_by(state)%>%
summarize(avg_carpool = mean(carpool))%>%
mutate(avg_carpool_state = min_rank(desc(avg_carpool)))%>%
arrange(avg_carpool_state)%>%
top_n(n = -1, wt = avg_carpool_state)
## # A tibble: 1 x 3
## state avg_carpool avg_carpool_state
## <chr> <dbl> <int>
## 1 Alaska 12.1 1
On average, Alaska is the best ranked state for carpooling.
#checking for top 5 states for carpooling
county_data %>% group_by(state)%>%
summarize(avg_carpool = mean(carpool))%>%
arrange(desc(avg_carpool))%>%
top_n(5)
## Selecting by avg_carpool
## # A tibble: 5 x 2
## state avg_carpool
## <chr> <dbl>
## 1 Alaska 12.1
## 2 Arkansas 11.9
## 3 Utah 11.9
## 4 Texas 11.8
## 5 Nevada 11.7
The top 5 states for carpooling are Alaska, Arkansas, Utah, Texas, and Nevada.