library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.5 v dplyr 1.0.7
## v tidyr 1.1.4 v stringr 1.4.0
## v readr 2.0.2 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(glue)
##
## Attaching package: 'glue'
## The following object is masked from 'package:dplyr':
##
## collapse
# Importing data
census <- read_csv("D:/UC MSBANA/Data Wrangling with R/Week 4/homework3/acs_2015_county_data_revised.csv")
## Rows: 3142 Columns: 35
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (2): state, county
## dbl (33): census_id, total_pop, men, women, hispanic, white, black, native, ...
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
#Finding the structure of dataset
str(census)
## spec_tbl_df [3,142 x 35] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ census_id : num [1:3142] 1001 1003 1005 1007 1009 ...
## $ state : chr [1:3142] "Alabama" "Alabama" "Alabama" "Alabama" ...
## $ county : chr [1:3142] "Autauga" "Baldwin" "Barbour" "Bibb" ...
## $ total_pop : num [1:3142] 55221 195121 26932 22604 57710 ...
## $ men : num [1:3142] 26745 95314 14497 12073 28512 ...
## $ women : num [1:3142] 28476 99807 12435 10531 29198 ...
## $ hispanic : num [1:3142] 2.6 4.5 4.6 2.2 8.6 4.4 1.2 3.5 0.4 1.5 ...
## $ white : num [1:3142] 75.8 83.1 46.2 74.5 87.9 22.2 53.3 73 57.3 91.7 ...
## $ black : num [1:3142] 18.5 9.5 46.7 21.4 1.5 70.7 43.8 20.3 40.3 4.8 ...
## $ native : num [1:3142] 0.4 0.6 0.2 0.4 0.3 1.2 0.1 0.2 0.2 0.6 ...
## $ asian : num [1:3142] 1 0.7 0.4 0.1 0.1 0.2 0.4 0.9 0.8 0.3 ...
## $ pacific : num [1:3142] 0 0 0 0 0 0 0 0 0 0 ...
## $ citizen : num [1:3142] 40725 147695 20714 17495 42345 ...
## $ income : num [1:3142] 51281 50254 32964 38678 45813 ...
## $ income_per_cap: num [1:3142] 24974 27317 16824 18431 20532 ...
## $ poverty : num [1:3142] 12.9 13.4 26.7 16.8 16.7 24.6 25.4 20.5 21.6 19.2 ...
## $ child_poverty : num [1:3142] 18.6 19.2 45.3 27.9 27.2 38.4 39.2 31.6 37.2 30.1 ...
## $ professional : num [1:3142] 33.2 33.1 26.8 21.5 28.5 18.8 27.5 27.3 23.3 29.3 ...
## $ service : num [1:3142] 17 17.7 16.1 17.9 14.1 15 16.6 17.7 14.5 16 ...
## $ office : num [1:3142] 24.2 27.1 23.1 17.8 23.9 19.7 21.9 24.2 26.3 19.5 ...
## $ construction : num [1:3142] 8.6 10.8 10.8 19 13.5 20.1 10.3 10.5 11.5 13.7 ...
## $ production : num [1:3142] 17.1 11.2 23.1 23.7 19.9 26.4 23.7 20.4 24.4 21.5 ...
## $ drive : num [1:3142] 87.5 84.7 83.8 83.2 84.9 74.9 84.5 85.3 85.1 83.9 ...
## $ carpool : num [1:3142] 8.8 8.8 10.9 13.5 11.2 14.9 12.4 9.4 11.9 12.1 ...
## $ transit : num [1:3142] 0.1 0.1 0.4 0.5 0.4 0.7 0 0.2 0.2 0.2 ...
## $ walk : num [1:3142] 0.5 1 1.8 0.6 0.9 5 0.8 1.2 0.3 0.6 ...
## $ other_transp : num [1:3142] 1.3 1.4 1.5 1.5 0.4 1.7 0.6 1.2 0.4 0.7 ...
## $ work_at_home : num [1:3142] 1.8 3.9 1.6 0.7 2.3 2.8 1.7 2.7 2.1 2.5 ...
## $ mean_commute : num [1:3142] 26.5 26.4 24.1 28.8 34.9 27.5 24.6 24.1 25.1 27.4 ...
## $ employed : num [1:3142] 23986 85953 8597 8294 22189 ...
## $ private_work : num [1:3142] 73.6 81.5 71.8 76.8 82 79.5 77.4 74.1 85.1 73.1 ...
## $ public_work : num [1:3142] 20.9 12.3 20.8 16.1 13.5 15.1 16.2 20.8 12.1 18.5 ...
## $ self_employed : num [1:3142] 5.5 5.8 7.3 6.7 4.2 5.4 6.2 5 2.8 7.9 ...
## $ family_work : num [1:3142] 0 0.4 0.1 0.4 0.4 0 0.2 0.1 0 0.5 ...
## $ unemployment : num [1:3142] 7.6 7.5 17.6 8.3 7.7 18 10.9 12.3 8.9 7.9 ...
## - attr(*, "spec")=
## .. cols(
## .. census_id = col_double(),
## .. state = col_character(),
## .. county = col_character(),
## .. total_pop = col_double(),
## .. men = col_double(),
## .. women = col_double(),
## .. hispanic = col_double(),
## .. white = col_double(),
## .. black = col_double(),
## .. native = col_double(),
## .. asian = col_double(),
## .. pacific = col_double(),
## .. citizen = col_double(),
## .. income = col_double(),
## .. income_per_cap = col_double(),
## .. poverty = col_double(),
## .. child_poverty = col_double(),
## .. professional = col_double(),
## .. service = col_double(),
## .. office = col_double(),
## .. construction = col_double(),
## .. production = col_double(),
## .. drive = col_double(),
## .. carpool = col_double(),
## .. transit = col_double(),
## .. walk = col_double(),
## .. other_transp = col_double(),
## .. work_at_home = col_double(),
## .. mean_commute = col_double(),
## .. employed = col_double(),
## .. private_work = col_double(),
## .. public_work = col_double(),
## .. self_employed = col_double(),
## .. family_work = col_double(),
## .. unemployment = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
state and county are of chr datatype.We may convert them to factor datatype because they represent categorical variables with levels as the state names and county names.#Converting the variable types to Factor
census$state <- as.factor(census$state)
census$county <- as.factor(census$county)
#Using glimpse() to display the modified dataset
library(dplyr)
census %>% glimpse()
## Rows: 3,142
## Columns: 35
## $ census_id <dbl> 1001, 1003, 1005, 1007, 1009, 1011, 1013, 1015, 1017, 1~
## $ state <fct> Alabama, Alabama, Alabama, Alabama, Alabama, Alabama, A~
## $ county <fct> Autauga, Baldwin, Barbour, Bibb, Blount, Bullock, Butle~
## $ total_pop <dbl> 55221, 195121, 26932, 22604, 57710, 10678, 20354, 11664~
## $ men <dbl> 26745, 95314, 14497, 12073, 28512, 5660, 9502, 56274, 1~
## $ women <dbl> 28476, 99807, 12435, 10531, 29198, 5018, 10852, 60374, ~
## $ hispanic <dbl> 2.6, 4.5, 4.6, 2.2, 8.6, 4.4, 1.2, 3.5, 0.4, 1.5, 7.6, ~
## $ white <dbl> 75.8, 83.1, 46.2, 74.5, 87.9, 22.2, 53.3, 73.0, 57.3, 9~
## $ black <dbl> 18.5, 9.5, 46.7, 21.4, 1.5, 70.7, 43.8, 20.3, 40.3, 4.8~
## $ native <dbl> 0.4, 0.6, 0.2, 0.4, 0.3, 1.2, 0.1, 0.2, 0.2, 0.6, 0.4, ~
## $ asian <dbl> 1.0, 0.7, 0.4, 0.1, 0.1, 0.2, 0.4, 0.9, 0.8, 0.3, 0.3, ~
## $ pacific <dbl> 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ~
## $ citizen <dbl> 40725, 147695, 20714, 17495, 42345, 8057, 15581, 88612,~
## $ income <dbl> 51281, 50254, 32964, 38678, 45813, 31938, 32229, 41703,~
## $ income_per_cap <dbl> 24974, 27317, 16824, 18431, 20532, 17580, 18390, 21374,~
## $ poverty <dbl> 12.9, 13.4, 26.7, 16.8, 16.7, 24.6, 25.4, 20.5, 21.6, 1~
## $ child_poverty <dbl> 18.6, 19.2, 45.3, 27.9, 27.2, 38.4, 39.2, 31.6, 37.2, 3~
## $ professional <dbl> 33.2, 33.1, 26.8, 21.5, 28.5, 18.8, 27.5, 27.3, 23.3, 2~
## $ service <dbl> 17.0, 17.7, 16.1, 17.9, 14.1, 15.0, 16.6, 17.7, 14.5, 1~
## $ office <dbl> 24.2, 27.1, 23.1, 17.8, 23.9, 19.7, 21.9, 24.2, 26.3, 1~
## $ construction <dbl> 8.6, 10.8, 10.8, 19.0, 13.5, 20.1, 10.3, 10.5, 11.5, 13~
## $ production <dbl> 17.1, 11.2, 23.1, 23.7, 19.9, 26.4, 23.7, 20.4, 24.4, 2~
## $ drive <dbl> 87.5, 84.7, 83.8, 83.2, 84.9, 74.9, 84.5, 85.3, 85.1, 8~
## $ carpool <dbl> 8.8, 8.8, 10.9, 13.5, 11.2, 14.9, 12.4, 9.4, 11.9, 12.1~
## $ transit <dbl> 0.1, 0.1, 0.4, 0.5, 0.4, 0.7, 0.0, 0.2, 0.2, 0.2, 0.2, ~
## $ walk <dbl> 0.5, 1.0, 1.8, 0.6, 0.9, 5.0, 0.8, 1.2, 0.3, 0.6, 1.1, ~
## $ other_transp <dbl> 1.3, 1.4, 1.5, 1.5, 0.4, 1.7, 0.6, 1.2, 0.4, 0.7, 1.4, ~
## $ work_at_home <dbl> 1.8, 3.9, 1.6, 0.7, 2.3, 2.8, 1.7, 2.7, 2.1, 2.5, 1.9, ~
## $ mean_commute <dbl> 26.5, 26.4, 24.1, 28.8, 34.9, 27.5, 24.6, 24.1, 25.1, 2~
## $ employed <dbl> 23986, 85953, 8597, 8294, 22189, 3865, 7813, 47401, 136~
## $ private_work <dbl> 73.6, 81.5, 71.8, 76.8, 82.0, 79.5, 77.4, 74.1, 85.1, 7~
## $ public_work <dbl> 20.9, 12.3, 20.8, 16.1, 13.5, 15.1, 16.2, 20.8, 12.1, 1~
## $ self_employed <dbl> 5.5, 5.8, 7.3, 6.7, 4.2, 5.4, 6.2, 5.0, 2.8, 7.9, 4.1, ~
## $ family_work <dbl> 0.0, 0.4, 0.1, 0.4, 0.4, 0.0, 0.2, 0.1, 0.0, 0.5, 0.5, ~
## $ unemployment <dbl> 7.6, 7.5, 17.6, 8.3, 7.7, 18.0, 10.9, 12.3, 8.9, 7.9, 9~
# Finding the total number of missing values per each column
colSums(is.na(census))
## census_id state county total_pop men
## 0 0 0 0 0
## women hispanic white black native
## 0 0 0 0 0
## asian pacific citizen income income_per_cap
## 0 0 0 1 0
## poverty child_poverty professional service office
## 0 1 0 0 0
## construction production drive carpool transit
## 0 0 0 0 0
## walk other_transp work_at_home mean_commute employed
## 0 0 0 0 0
## private_work public_work self_employed family_work unemployment
## 0 0 0 0 0
# Printing the rows with NA values
census[rowSums(is.na(census)) > 0, ]
## # A tibble: 2 x 35
## census_id state county total_pop men women hispanic white black native asian
## <dbl> <fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 15005 Hawa~ Kalaw~ 85 42 43 4.7 37.6 0 0 21.2
## 2 48301 Texas Loving 117 74 43 35 41 0 12.8 0
## # ... with 24 more variables: pacific <dbl>, citizen <dbl>, income <dbl>,
## # income_per_cap <dbl>, poverty <dbl>, child_poverty <dbl>,
## # professional <dbl>, service <dbl>, office <dbl>, construction <dbl>,
## # production <dbl>, drive <dbl>, carpool <dbl>, transit <dbl>, walk <dbl>,
## # other_transp <dbl>, work_at_home <dbl>, mean_commute <dbl>, employed <dbl>,
## # private_work <dbl>, public_work <dbl>, self_employed <dbl>,
## # family_work <dbl>, unemployment <dbl>
income and child_poverty columns.Loving county in the Texas state is missing. Child Poverty for Kalawao county in Hawaii state is missing.income for Texas state and child_poverty for Hawaii state.# Summary statistics of 'income' for Texas and 'child_poverty' for Hawaii
census %>% filter(state == "Texas") %>% select(income) %>% summary()
## income
## Min. :22741
## 1st Qu.:39516
## Median :44674
## Mean :46746
## 3rd Qu.:51758
## Max. :89152
## NA's :1
census %>% filter(state == "Hawaii") %>% select(child_poverty) %>% summary()
## child_poverty
## Min. :12.50
## 1st Qu.:12.72
## Median :13.40
## Mean :17.23
## 3rd Qu.:17.90
## Max. :29.60
## NA's :1
# Boxplots and histograms for income, child poverty columns
boxplot(census[census$state == "Texas",]$income, main = "Boxplot of income in Texas")
boxplot(census[census$state == "Hawaii",]$child_poverty, main = "Box plot of Child poverty in Hawaii")
hist(census[census$state == "Texas",]$income, xlab = "Income(in dollars)",main = "Histogram of Income in Texas")
hist(census[census$state == "Hawaii",]$child_poverty, xlab = "Child Poverty Rate", main = "Histogram of Child Poverty in Hawaii")
# Imputation using median
census$income[which(is.na(census$income))] <- 44674
census$child_poverty[which(is.na(census$child_poverty))] <- 13.40
# Re-checking missing values
which(is.na(census$income))
## integer(0)
which(is.na(census$child_poverty))
## integer(0)
# Summary of all variables
summary(census)
## census_id state county total_pop
## Min. : 1001 Texas : 254 Washington: 31 Min. : 85
## 1st Qu.:18178 Georgia : 159 Jefferson : 26 1st Qu.: 11028
## Median :29176 Virginia: 133 Franklin : 25 Median : 25768
## Mean :30384 Kentucky: 120 Jackson : 24 Mean : 100737
## 3rd Qu.:45081 Missouri: 115 Lincoln : 24 3rd Qu.: 67552
## Max. :56045 Kansas : 105 Madison : 20 Max. :10038388
## (Other) :2256 (Other) :2992
## men women hispanic white
## Min. : 42 Min. : 43 Min. : 0.000 Min. : 0.90
## 1st Qu.: 5546 1st Qu.: 5466 1st Qu.: 1.900 1st Qu.:65.60
## Median : 12826 Median : 12907 Median : 3.700 Median :84.60
## Mean : 49565 Mean : 51171 Mean : 8.826 Mean :77.28
## 3rd Qu.: 33319 3rd Qu.: 34122 3rd Qu.: 9.000 3rd Qu.:93.30
## Max. :4945351 Max. :5093037 Max. :98.700 Max. :99.80
##
## black native asian pacific
## Min. : 0.000 Min. : 0.000 Min. : 0.000 Min. : 0.00000
## 1st Qu.: 0.600 1st Qu.: 0.100 1st Qu.: 0.200 1st Qu.: 0.00000
## Median : 2.100 Median : 0.300 Median : 0.500 Median : 0.00000
## Mean : 8.879 Mean : 1.766 Mean : 1.258 Mean : 0.08475
## 3rd Qu.:10.175 3rd Qu.: 0.600 3rd Qu.: 1.200 3rd Qu.: 0.00000
## Max. :85.900 Max. :92.100 Max. :41.600 Max. :35.30000
##
## citizen income income_per_cap poverty
## Min. : 80 Min. : 19328 Min. : 8292 Min. : 1.4
## 1st Qu.: 8254 1st Qu.: 38826 1st Qu.:20471 1st Qu.:12.0
## Median : 19434 Median : 45095 Median :23577 Median :16.0
## Mean : 70804 Mean : 46830 Mean :24338 Mean :16.7
## 3rd Qu.: 50728 3rd Qu.: 52249 3rd Qu.:27138 3rd Qu.:20.3
## Max. :6046749 Max. :123453 Max. :65600 Max. :53.3
##
## child_poverty professional service office
## Min. : 0.00 Min. :13.50 Min. : 5.00 Min. : 4.10
## 1st Qu.:16.10 1st Qu.:26.70 1st Qu.:15.90 1st Qu.:20.20
## Median :22.50 Median :30.00 Median :18.00 Median :22.40
## Mean :23.28 Mean :31.04 Mean :18.26 Mean :22.13
## 3rd Qu.:29.48 3rd Qu.:34.40 3rd Qu.:20.20 3rd Qu.:24.30
## Max. :72.30 Max. :74.00 Max. :36.60 Max. :35.40
##
## construction production drive carpool
## Min. : 1.70 Min. : 0.00 Min. : 5.20 Min. : 0.00
## 1st Qu.: 9.80 1st Qu.:11.53 1st Qu.:76.60 1st Qu.: 8.50
## Median :12.20 Median :15.40 Median :80.60 Median : 9.90
## Mean :12.74 Mean :15.82 Mean :79.08 Mean :10.33
## 3rd Qu.:15.00 3rd Qu.:19.40 3rd Qu.:83.60 3rd Qu.:11.88
## Max. :40.30 Max. :55.60 Max. :94.60 Max. :29.90
##
## transit walk other_transp work_at_home
## Min. : 0.0000 Min. : 0.000 Min. : 0.000 Min. : 0.000
## 1st Qu.: 0.1000 1st Qu.: 1.400 1st Qu.: 0.900 1st Qu.: 2.800
## Median : 0.4000 Median : 2.400 Median : 1.300 Median : 4.000
## Mean : 0.9675 Mean : 3.307 Mean : 1.614 Mean : 4.697
## 3rd Qu.: 0.8000 3rd Qu.: 4.000 3rd Qu.: 1.900 3rd Qu.: 5.700
## Max. :61.7000 Max. :71.200 Max. :39.100 Max. :37.200
##
## mean_commute employed private_work public_work
## Min. : 4.90 Min. : 62 Min. :25.00 Min. : 5.80
## 1st Qu.:19.30 1st Qu.: 4524 1st Qu.:70.90 1st Qu.:13.10
## Median :22.90 Median : 10644 Median :75.80 Median :16.10
## Mean :23.15 Mean : 46387 Mean :74.44 Mean :17.35
## 3rd Qu.:26.60 3rd Qu.: 29254 3rd Qu.:79.80 3rd Qu.:20.10
## Max. :44.00 Max. :4635465 Max. :88.30 Max. :66.20
##
## self_employed family_work unemployment
## Min. : 0.000 Min. :0.0000 Min. : 0.000
## 1st Qu.: 5.400 1st Qu.:0.1000 1st Qu.: 5.500
## Median : 6.900 Median :0.2000 Median : 7.500
## Mean : 7.921 Mean :0.2915 Mean : 7.815
## 3rd Qu.: 9.400 3rd Qu.:0.3000 3rd Qu.: 9.700
## Max. :36.600 Max. :9.8000 Max. :29.400
##
From the summary statistics and data dictionary we observe that the column employed should have percentage values. But as per the data file, the column has total number of employed people of age greater than 16. We should convert these values to percentage of employed people by dividing with total_pop and multiplying using 100.
census$employed <- (census$employed/census$total_pop)*100
head(census$employed)
## [1] 43.43637 44.05113 31.92113 36.69262 38.44914 36.19592
# Total No. of counties with no. of women > no. of men
census %>% filter(women > men) %>% select(county) %>% nrow()
## [1] 1985
# Total no. of counties with unemployment < 10%
census %>% filter(unemployment < 10) %>% select(county) %>% nrow()
## [1] 2420
# Using top_n() function
census %>% top_n(10, mean_commute) %>% arrange(mean_commute)
## # A tibble: 10 x 35
## census_id state county total_pop men women hispanic white black native
## <dbl> <fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 54015 West V~ Clay 9141 4.58e3 4.56e3 0.1 97.9 0 0.4
## 2 36047 New Yo~ Kings 2595259 1.23e6 1.37e6 19.6 35.7 31.2 0.2
## 3 8093 Colora~ Park 16189 8.52e3 7.66e3 5.7 90.4 0 1.1
## 4 51193 Virgin~ Westmo~ 17557 8.58e3 8.98e3 6.1 63.5 27.9 0.2
## 5 36081 New Yo~ Queens 2301139 1.12e6 1.19e6 27.9 26.1 17.4 0.2
## 6 36085 New Yo~ Richmo~ 472481 2.29e5 2.44e5 17.8 62.8 9.6 0.1
## 7 51187 Virgin~ Warren 38481 1.93e4 1.92e4 3.9 87.9 4 0.2
## 8 24017 Maryla~ Charles 152754 7.37e4 7.90e4 5 45.4 41.8 0.7
## 9 36005 New Yo~ Bronx 1428357 6.72e5 7.56e5 54.6 10.3 29.6 0.2
## 10 42103 Pennsy~ Pike 56632 2.82e4 2.84e4 9.9 81.8 5.5 0.2
## # ... with 25 more variables: asian <dbl>, pacific <dbl>, citizen <dbl>,
## # income <dbl>, income_per_cap <dbl>, poverty <dbl>, child_poverty <dbl>,
## # professional <dbl>, service <dbl>, office <dbl>, construction <dbl>,
## # production <dbl>, drive <dbl>, carpool <dbl>, transit <dbl>, walk <dbl>,
## # other_transp <dbl>, work_at_home <dbl>, mean_commute <dbl>, employed <dbl>,
## # private_work <dbl>, public_work <dbl>, self_employed <dbl>,
## # family_work <dbl>, unemployment <dbl>
# Using mutate() function
census %>% mutate(percent_women = (women/total_pop)*100) %>% select("census_id", "county","state", "percent_women") %>%
arrange(percent_women) %>% top_n(-10)
## Selecting by percent_women
## # A tibble: 10 x 4
## census_id county state percent_women
## <dbl> <fct> <fct> <dbl>
## 1 42053 Forest Pennsylvania 26.8
## 2 8011 Bent Colorado 31.4
## 3 51183 Sussex Virginia 31.5
## 4 13309 Wheeler Georgia 32.1
## 5 6035 Lassen California 33.2
## 6 48095 Concho Texas 33.3
## 7 13053 Chattahoochee Georgia 33.4
## 8 2013 Aleutians East Borough Alaska 33.5
## 9 22125 West Feliciana Louisiana 33.6
## 10 32027 Pershing Nevada 33.7
census %>%
mutate(race_pct = hispanic + white + black + native + asian + pacific) %>%
select(census_id,county,state,race_pct) %>%
arrange(race_pct) %>%
top_n(-10)
## Selecting by race_pct
## # A tibble: 10 x 4
## census_id county state race_pct
## <dbl> <fct> <fct> <dbl>
## 1 15001 Hawaii Hawaii 76.4
## 2 15009 Maui Hawaii 79.2
## 3 40097 Mayes Oklahoma 79.7
## 4 15003 Honolulu Hawaii 81.5
## 5 40123 Pontotoc Oklahoma 82.8
## 6 47061 Grundy Tennessee 83
## 7 2282 Yakutat City and Borough Alaska 83.4
## 8 40069 Johnston Oklahoma 84
## 9 15007 Kauai Hawaii 84.1
## 10 40003 Alfalfa Oklahoma 85.1
census %>%
mutate(race_pct = hispanic + white + black + native + asian + pacific) %>%
select(census_id,county,state,race_pct) %>%
arrange(race_pct) %>%
top_n(-10)
## Selecting by race_pct
## # A tibble: 10 x 4
## census_id county state race_pct
## <dbl> <fct> <fct> <dbl>
## 1 15001 Hawaii Hawaii 76.4
## 2 15009 Maui Hawaii 79.2
## 3 40097 Mayes Oklahoma 79.7
## 4 15003 Honolulu Hawaii 81.5
## 5 40123 Pontotoc Oklahoma 82.8
## 6 47061 Grundy Tennessee 83
## 7 2282 Yakutat City and Borough Alaska 83.4
## 8 40069 Johnston Oklahoma 84
## 9 15007 Kauai Hawaii 84.1
## 10 40003 Alfalfa Oklahoma 85.1
census %>%
group_by(state) %>%
mutate(race_pct = hispanic + white + black + native + asian + pacific) %>%
summarise(mean_race_pct = mean(race_pct)) %>%
arrange(mean_race_pct) %>%
top_n(-1)
## Selecting by mean_race_pct
## # A tibble: 1 x 2
## state mean_race_pct
## <fct> <dbl>
## 1 Hawaii 84
census %>%
mutate(race_pct = round((hispanic + white + black + native + asian + pacific),2)) %>%
select(census_id,county,state,race_pct) %>%
filter(race_pct > 100)
## # A tibble: 5 x 4
## census_id county state race_pct
## <dbl> <fct> <fct> <dbl>
## 1 31073 Gosper Nebraska 100.
## 2 31091 Hooker Nebraska 100.
## 3 31125 Nance Nebraska 100.
## 4 48017 Bailey Texas 100.
## 5 48137 Edwards Texas 100.
census %>%
mutate(race_pct = round((hispanic + white + black + native + asian + pacific),2)) %>%
select(census_id,county,state,race_pct) %>%
filter(race_pct == 100)
## # A tibble: 38 x 4
## census_id county state race_pct
## <dbl> <fct> <fct> <dbl>
## 1 1065 Hale Alabama 100
## 2 1131 Wilcox Alabama 100
## 3 13201 Miller Georgia 100
## 4 13307 Webster Georgia 100
## 5 20199 Wallace Kansas 100
## 6 21031 Butler Kentucky 100
## 7 28021 Claiborne Mississippi 100
## 8 28125 Sharkey Mississippi 100
## 9 30019 Daniels Montana 100
## 10 30069 Petroleum Montana 100
## # ... with 28 more rows
carpool variablecarpool_rank variablecensus %>%
mutate(carpool_rank = min_rank(-carpool)) %>%
select(census_id,county,state,carpool,carpool_rank) %>%
arrange(carpool_rank) %>%
top_n(-5)
## Selecting by carpool_rank
## # A tibble: 5 x 5
## census_id county state carpool carpool_rank
## <dbl> <fct> <fct> <dbl> <int>
## 1 13061 Clay Georgia 29.9 1
## 2 18087 LaGrange Indiana 27 2
## 3 13165 Jenkins Georgia 25.3 3
## 4 5133 Sevier Arkansas 24.4 4
## 5 20175 Seward Kansas 23.4 5
census %>%
mutate(carpool_rank = min_rank(-carpool)) %>%
select(census_id,county,state,carpool,carpool_rank) %>%
arrange(carpool_rank) %>%
top_n(-10)
## Selecting by carpool_rank
## # A tibble: 10 x 5
## census_id county state carpool carpool_rank
## <dbl> <fct> <fct> <dbl> <int>
## 1 13061 Clay Georgia 29.9 1
## 2 18087 LaGrange Indiana 27 2
## 3 13165 Jenkins Georgia 25.3 3
## 4 5133 Sevier Arkansas 24.4 4
## 5 20175 Seward Kansas 23.4 5
## 6 48079 Cochran Texas 22.8 6
## 7 48247 Jim Hogg Texas 22.6 7
## 8 48393 Roberts Texas 22.4 8
## 9 39075 Holmes Ohio 21.8 9
## 10 21197 Powell Kentucky 21.6 10
census %>%
mutate(carpool_rank = min_rank(-carpool)) %>%
select(census_id,county,state,carpool,carpool_rank) %>%
arrange(carpool_rank) %>%
top_n(10)
## Selecting by carpool_rank
## # A tibble: 11 x 5
## census_id county state carpool carpool_rank
## <dbl> <fct> <fct> <dbl> <int>
## 1 46069 Hyde South Dakota 2.8 3132
## 2 51720 Norton city Virginia 2.8 3132
## 3 30019 Daniels Montana 2.6 3134
## 4 31057 Dundy Nebraska 2.6 3134
## 5 13309 Wheeler Georgia 2.3 3136
## 6 38029 Emmons North Dakota 2.3 3136
## 7 36061 New York New York 1.9 3138
## 8 31183 Wheeler Nebraska 1.3 3139
## 9 48235 Irion Texas 0.9 3140
## 10 48261 Kenedy Texas 0 3141
## 11 48269 King Texas 0 3141
census %>%
group_by(state) %>%
summarise(mean_carpool = mean(carpool)) %>%
mutate(carpool_rank = min_rank(-mean_carpool)) %>%
arrange(carpool_rank) %>%
top_n(-1)
## Selecting by carpool_rank
## # A tibble: 1 x 3
## state mean_carpool carpool_rank
## <fct> <dbl> <int>
## 1 Alaska 12.1 1
Alaska is the best ranked state, on average, for carpooling with a percentage of 12.13488.
census %>%
group_by(state) %>%
summarise(mean_carpool = mean(carpool)) %>%
mutate(carpool_rank = min_rank(-mean_carpool)) %>%
arrange(carpool_rank) %>%
top_n(-5)
## Selecting by carpool_rank
## # A tibble: 5 x 3
## state mean_carpool carpool_rank
## <fct> <dbl> <int>
## 1 Alaska 12.1 1
## 2 Arkansas 11.9 2
## 3 Utah 11.9 3
## 4 Texas 11.8 4
## 5 Nevada 11.7 5
Alaska, Arkansas, Utah, Texas, Nevada are the top 5 states for carpooling.