We import the dataset using a tidyverse function. We observe that there are 3142 observations of 35 variables/columns.
library(tidyverse)
county <- read_csv("acs_2015_county_data_revised.csv")
dim(county)
## [1] 3142 35
Only the census id needs changing from a double to a character. All other variable types are appropriate.
glimpse(county)
## Observations: 3,142
## Variables: 35
## $ census_id <dbl> 1001, 1003, 1005, 1007, 1009, 1011, 1013, 1015,...
## $ state <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Al...
## $ county <chr> "Autauga", "Baldwin", "Barbour", "Bibb", "Bloun...
## $ total_pop <dbl> 55221, 195121, 26932, 22604, 57710, 10678, 2035...
## $ men <dbl> 26745, 95314, 14497, 12073, 28512, 5660, 9502, ...
## $ women <dbl> 28476, 99807, 12435, 10531, 29198, 5018, 10852,...
## $ hispanic <dbl> 2.6, 4.5, 4.6, 2.2, 8.6, 4.4, 1.2, 3.5, 0.4, 1....
## $ white <dbl> 75.8, 83.1, 46.2, 74.5, 87.9, 22.2, 53.3, 73.0,...
## $ black <dbl> 18.5, 9.5, 46.7, 21.4, 1.5, 70.7, 43.8, 20.3, 4...
## $ native <dbl> 0.4, 0.6, 0.2, 0.4, 0.3, 1.2, 0.1, 0.2, 0.2, 0....
## $ asian <dbl> 1.0, 0.7, 0.4, 0.1, 0.1, 0.2, 0.4, 0.9, 0.8, 0....
## $ pacific <dbl> 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0....
## $ citizen <dbl> 40725, 147695, 20714, 17495, 42345, 8057, 15581...
## $ income <dbl> 51281, 50254, 32964, 38678, 45813, 31938, 32229...
## $ income_per_cap <dbl> 24974, 27317, 16824, 18431, 20532, 17580, 18390...
## $ poverty <dbl> 12.9, 13.4, 26.7, 16.8, 16.7, 24.6, 25.4, 20.5,...
## $ child_poverty <dbl> 18.6, 19.2, 45.3, 27.9, 27.2, 38.4, 39.2, 31.6,...
## $ professional <dbl> 33.2, 33.1, 26.8, 21.5, 28.5, 18.8, 27.5, 27.3,...
## $ service <dbl> 17.0, 17.7, 16.1, 17.9, 14.1, 15.0, 16.6, 17.7,...
## $ office <dbl> 24.2, 27.1, 23.1, 17.8, 23.9, 19.7, 21.9, 24.2,...
## $ construction <dbl> 8.6, 10.8, 10.8, 19.0, 13.5, 20.1, 10.3, 10.5, ...
## $ production <dbl> 17.1, 11.2, 23.1, 23.7, 19.9, 26.4, 23.7, 20.4,...
## $ drive <dbl> 87.5, 84.7, 83.8, 83.2, 84.9, 74.9, 84.5, 85.3,...
## $ carpool <dbl> 8.8, 8.8, 10.9, 13.5, 11.2, 14.9, 12.4, 9.4, 11...
## $ transit <dbl> 0.1, 0.1, 0.4, 0.5, 0.4, 0.7, 0.0, 0.2, 0.2, 0....
## $ walk <dbl> 0.5, 1.0, 1.8, 0.6, 0.9, 5.0, 0.8, 1.2, 0.3, 0....
## $ other_transp <dbl> 1.3, 1.4, 1.5, 1.5, 0.4, 1.7, 0.6, 1.2, 0.4, 0....
## $ work_at_home <dbl> 1.8, 3.9, 1.6, 0.7, 2.3, 2.8, 1.7, 2.7, 2.1, 2....
## $ mean_commute <dbl> 26.5, 26.4, 24.1, 28.8, 34.9, 27.5, 24.6, 24.1,...
## $ employed <dbl> 23986, 85953, 8597, 8294, 22189, 3865, 7813, 47...
## $ private_work <dbl> 73.6, 81.5, 71.8, 76.8, 82.0, 79.5, 77.4, 74.1,...
## $ public_work <dbl> 20.9, 12.3, 20.8, 16.1, 13.5, 15.1, 16.2, 20.8,...
## $ self_employed <dbl> 5.5, 5.8, 7.3, 6.7, 4.2, 5.4, 6.2, 5.0, 2.8, 7....
## $ family_work <dbl> 0.0, 0.4, 0.1, 0.4, 0.4, 0.0, 0.2, 0.1, 0.0, 0....
## $ unemployment <dbl> 7.6, 7.5, 17.6, 8.3, 7.7, 18.0, 10.9, 12.3, 8.9...
county$census_id <- as.character(county$census_id)
There is only one missing value for field ‘child_poverty’ and one missing value for field ‘income’. Because these are so few observations, out of 3000, I think omitting the field would be better than imputing a value into it. We successfully removed the two records with NA data from our set.
county <- drop_na(county)
#check our work, removed two records
sum(is.na(county))
## [1] 0
dim(county)
## [1] 3140 35
Unusual field is ‘employed’ since it does not seem to match its data dictionary definition of percentage employed, ages 16+. It is a number ranging from 166 to 4635465, with a mean of 46416. Because this is not a percentage and I do not know its meaning, I will probably not use it in my analysis unless I get new insight about it.
summary(county)
## census_id state county
## Length:3140 Length:3140 Length:3140
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## total_pop men women hispanic
## Min. : 267 Min. : 136 Min. : 131 Min. : 0.000
## 1st Qu.: 11036 1st Qu.: 5551 1st Qu.: 5488 1st Qu.: 1.900
## Median : 25793 Median : 12838 Median : 12916 Median : 3.700
## Mean : 100801 Mean : 49597 Mean : 51204 Mean : 8.819
## 3rd Qu.: 67620 3rd Qu.: 33328 3rd Qu.: 34123 3rd Qu.: 9.000
## Max. :10038388 Max. :4945351 Max. :5093037 Max. :98.700
## white black native asian
## Min. : 0.90 Min. : 0.000 Min. : 0.000 Min. : 0.000
## 1st Qu.:65.67 1st Qu.: 0.600 1st Qu.: 0.100 1st Qu.: 0.200
## Median :84.65 Median : 2.100 Median : 0.300 Median : 0.500
## Mean :77.31 Mean : 8.885 Mean : 1.763 Mean : 1.253
## 3rd Qu.:93.33 3rd Qu.:10.200 3rd Qu.: 0.600 3rd Qu.: 1.200
## Max. :99.80 Max. :85.900 Max. :92.100 Max. :41.600
## pacific citizen income income_per_cap
## Min. : 0.00000 Min. : 199 Min. : 19328 Min. : 8292
## 1st Qu.: 0.00000 1st Qu.: 8276 1st Qu.: 38826 1st Qu.:20470
## Median : 0.00000 Median : 19454 Median : 45095 Median :23575
## Mean : 0.07357 Mean : 70849 Mean : 46824 Mean :24331
## 3rd Qu.: 0.00000 3rd Qu.: 50795 3rd Qu.: 52248 3rd Qu.:27138
## Max. :11.10000 Max. :6046749 Max. :123453 Max. :65600
## poverty child_poverty professional service
## Min. : 1.4 Min. : 0.00 Min. :13.50 Min. : 5.00
## 1st Qu.:12.0 1st Qu.:16.10 1st Qu.:26.70 1st Qu.:15.90
## Median :16.0 Median :22.50 Median :30.00 Median :18.00
## Mean :16.7 Mean :23.29 Mean :31.05 Mean :18.25
## 3rd Qu.:20.3 3rd Qu.:29.50 3rd Qu.:34.42 3rd Qu.:20.20
## Max. :53.3 Max. :72.30 Max. :74.00 Max. :36.60
## office construction production drive
## Min. : 4.10 Min. : 1.70 Min. : 0.00 Min. : 5.2
## 1st Qu.:20.20 1st Qu.: 9.80 1st Qu.:11.50 1st Qu.:76.6
## Median :22.40 Median :12.20 Median :15.40 Median :80.6
## Mean :22.13 Mean :12.75 Mean :15.82 Mean :79.1
## 3rd Qu.:24.30 3rd Qu.:15.00 3rd Qu.:19.40 3rd Qu.:83.6
## Max. :35.40 Max. :40.30 Max. :55.60 Max. :94.6
## carpool transit walk other_transp
## Min. : 0.00 Min. : 0.0000 Min. : 0.000 Min. : 0.00
## 1st Qu.: 8.50 1st Qu.: 0.1000 1st Qu.: 1.400 1st Qu.: 0.90
## Median : 9.90 Median : 0.4000 Median : 2.400 Median : 1.30
## Mean :10.33 Mean : 0.9681 Mean : 3.294 Mean : 1.61
## 3rd Qu.:11.90 3rd Qu.: 0.8000 3rd Qu.: 4.000 3rd Qu.: 1.90
## Max. :29.90 Max. :61.7000 Max. :71.200 Max. :39.10
## work_at_home mean_commute employed private_work
## Min. : 0.000 Min. : 4.90 Min. : 166 Min. :29.50
## 1st Qu.: 2.800 1st Qu.:19.30 1st Qu.: 4532 1st Qu.:70.90
## Median : 4.000 Median :22.90 Median : 10657 Median :75.85
## Mean : 4.694 Mean :23.15 Mean : 46416 Mean :74.45
## 3rd Qu.: 5.700 3rd Qu.:26.60 3rd Qu.: 29272 3rd Qu.:79.80
## Max. :37.200 Max. :44.00 Max. :4635465 Max. :88.30
## public_work self_employed family_work unemployment
## Min. : 5.80 Min. : 0.000 Min. :0.0000 Min. : 0.000
## 1st Qu.:13.07 1st Qu.: 5.400 1st Qu.:0.1000 1st Qu.: 5.500
## Median :16.10 Median : 6.900 Median :0.2000 Median : 7.500
## Mean :17.33 Mean : 7.922 Mean :0.2917 Mean : 7.815
## 3rd Qu.:20.10 3rd Qu.: 9.400 3rd Qu.:0.3000 3rd Qu.: 9.700
## Max. :66.20 Max. :36.600 Max. :9.8000 Max. :29.400
1984 counties have more women than men.
county %>%
filter(women > men) %>%
nrow()
## [1] 1984
2419 counties have an unemployment rate less than 10%.
county %>%
filter(unemployment < 10.0) %>%
nrow()
## [1] 2419
Top 10 counties with the highest mean commute are Pike, Bronx, Charles, Warren, Queens, Richmond, Westmoreland, Park, Kings, and Clay.
county %>%
arrange(desc(mean_commute)) %>%
select(census_id, county, state, mean_commute) %>%
top_n(n=10)
## Selecting by mean_commute
## # A tibble: 10 x 4
## census_id county state mean_commute
## <chr> <chr> <chr> <dbl>
## 1 42103 Pike Pennsylvania 44
## 2 36005 Bronx New York 43
## 3 24017 Charles Maryland 42.8
## 4 51187 Warren Virginia 42.7
## 5 36081 Queens New York 42.6
## 6 36085 Richmond New York 42.6
## 7 51193 Westmoreland Virginia 42.5
## 8 8093 Park Colorado 42.4
## 9 36047 Kings New York 41.7
## 10 54015 Clay West Virginia 41.4
The 10 counties with lowest percentage of women are Forest, Bent, Sussex, Wheeler, Lassen, Concho, Chattachoochee, Aleutians East Borough, West Feliciana and Pershing.
county %>%
mutate(pct_women = women/total_pop) %>%
arrange(pct_women) %>%
select(census_id, county, state, pct_women) %>%
top_n(-10)
## Selecting by pct_women
## # A tibble: 10 x 4
## census_id county state pct_women
## <chr> <chr> <chr> <dbl>
## 1 42053 Forest Pennsylvania 0.268
## 2 8011 Bent Colorado 0.314
## 3 51183 Sussex Virginia 0.315
## 4 13309 Wheeler Georgia 0.321
## 5 6035 Lassen California 0.332
## 6 48095 Concho Texas 0.333
## 7 13053 Chattahoochee Georgia 0.334
## 8 2013 Aleutians East Borough Alaska 0.335
## 9 22125 West Feliciana Louisiana 0.336
## 10 32027 Pershing Nevada 0.337
Below shows the code for creating a new variable that calculates the sum of all race percentage variables. The variable is called pct_tot
.
county <- mutate(county, pct_tot = hispanic + white + black + native + asian + pacific)
The 10 counties with the lowest sum of these race percentage variables are Todd, Kusilvak Census Area, Conecuh, Dewey, Oglala Lakota, Crowley, Corson, Ziebach, Quitman, and Allendale.
county %>%
select(pct_tot, everything()) %>%
arrange(pct_tot) %>%
top_n(10)
## Selecting by unemployment
## # A tibble: 11 x 36
## pct_tot census_id state county total_pop men women hispanic white
## <dbl> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 89 46121 Sout~ Todd 9942 4862 5080 3.7 10
## 2 93.8 2158 Alas~ Kusil~ 7914 4200 3714 1 4.5
## 3 97.6 1035 Alab~ Conec~ 12865 6176 6689 1.6 51
## 4 98.2 46041 Sout~ Dewey 5579 2718 2861 0.9 21.7
## 5 98.4 46102 Sout~ Oglal~ 14153 6809 7344 1 4.8
## 6 98.4 8025 Colo~ Crowl~ 5551 3145 2406 30.2 61.1
## 7 98.9 46031 Sout~ Corson 4149 2123 2026 0.6 30.9
## 8 99 46137 Sout~ Zieba~ 2833 1391 1442 3.7 23.7
## 9 99.4 28119 Miss~ Quitm~ 7761 3751 4010 0.9 28.3
## 10 99.8 45005 Sout~ Allen~ 9838 5225 4613 2.8 23.2
## 11 99.9 28053 Miss~ Humph~ 8984 4278 4706 2.6 22
## # ... with 27 more variables: black <dbl>, native <dbl>, asian <dbl>,
## # pacific <dbl>, citizen <dbl>, income <dbl>, income_per_cap <dbl>,
## # poverty <dbl>, child_poverty <dbl>, professional <dbl>, service <dbl>,
## # office <dbl>, construction <dbl>, production <dbl>, drive <dbl>,
## # carpool <dbl>, transit <dbl>, walk <dbl>, other_transp <dbl>,
## # work_at_home <dbl>, mean_commute <dbl>, employed <dbl>,
## # private_work <dbl>, public_work <dbl>, self_employed <dbl>,
## # family_work <dbl>, unemployment <dbl>
Mississippi, on average, has the lowest sum of these race percentage variables at 99.184 percent.
county %>%
select(pct_tot, everything()) %>%
group_by(state) %>%
summarise(avg = mean(pct_tot)) %>%
arrange(avg) %>%
top_n(1)
## Selecting by avg
## # A tibble: 1 x 2
## state avg
## <chr> <dbl>
## 1 Mississippi 99.2
Eleven counties have a sum greater than 100%.
county %>%
filter(pct_tot > 100) %>%
nrow()
## [1] 11
27 counties have a sum that equals exactly to 100%.
county %>%
filter(pct_tot == 100) %>%
nrow()
## [1] 27
A new variable called carpool_rank
has been created and shown below.
(county <- county %>%
mutate(carpool_rank = rank(desc(carpool), ties.method = "max")))
## # A tibble: 3,140 x 37
## census_id state county total_pop men women hispanic white black native
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1001 Alab~ Autau~ 55221 26745 28476 2.6 75.8 18.5 0.4
## 2 1003 Alab~ Baldw~ 195121 95314 99807 4.5 83.1 9.5 0.6
## 3 1005 Alab~ Barbo~ 26932 14497 12435 4.6 46.2 46.7 0.2
## 4 1007 Alab~ Bibb 22604 12073 10531 2.2 74.5 21.4 0.4
## 5 1009 Alab~ Blount 57710 28512 29198 8.6 87.9 1.5 0.3
## 6 1011 Alab~ Bullo~ 10678 5660 5018 4.4 22.2 70.7 1.2
## 7 1013 Alab~ Butler 20354 9502 10852 1.2 53.3 43.8 0.1
## 8 1015 Alab~ Calho~ 116648 56274 60374 3.5 73 20.3 0.2
## 9 1017 Alab~ Chamb~ 34079 16258 17821 0.4 57.3 40.3 0.2
## 10 1019 Alab~ Chero~ 26008 12975 13033 1.5 91.7 4.8 0.6
## # ... with 3,130 more rows, and 27 more variables: asian <dbl>,
## # pacific <dbl>, citizen <dbl>, income <dbl>, income_per_cap <dbl>,
## # poverty <dbl>, child_poverty <dbl>, professional <dbl>, service <dbl>,
## # office <dbl>, construction <dbl>, production <dbl>, drive <dbl>,
## # carpool <dbl>, transit <dbl>, walk <dbl>, other_transp <dbl>,
## # work_at_home <dbl>, mean_commute <dbl>, employed <dbl>,
## # private_work <dbl>, public_work <dbl>, self_employed <dbl>,
## # family_work <dbl>, unemployment <dbl>, pct_tot <dbl>,
## # carpool_rank <int>
The 10 highest ranked counties for carpooling are Clay, LaGrange, Jenkins, Sevier, Seward, Cochran, Jim Hogg, Roberts, Holmes, and Powell. See below for details.
county %>%
select(census_id, county, state, carpool, carpool_rank) %>%
arrange(carpool_rank) %>%
top_n(-10, wt=carpool_rank)
## # A tibble: 10 x 5
## census_id county state carpool carpool_rank
## <chr> <chr> <chr> <dbl> <int>
## 1 13061 Clay Georgia 29.9 1
## 2 18087 LaGrange Indiana 27 2
## 3 13165 Jenkins Georgia 25.3 3
## 4 5133 Sevier Arkansas 24.4 4
## 5 20175 Seward Kansas 23.4 5
## 6 48079 Cochran Texas 22.8 6
## 7 48247 Jim Hogg Texas 22.6 7
## 8 48393 Roberts Texas 22.4 8
## 9 39075 Holmes Ohio 21.8 9
## 10 21197 Powell Kentucky 21.6 10
The 10 lowest ranked counties for carpooling are Hyde, Norton city, Daniels, Dundy, Wheeler (Georgia), Emmons, New York, Wheeler (Nebraska), Irion, and Kenedy. See below for more details regarding these counties.
county %>%
select(census_id, county, state, carpool, carpool_rank) %>%
arrange(carpool_rank) %>%
top_n(10, wt=carpool_rank)
## # A tibble: 11 x 5
## census_id county state carpool carpool_rank
## <chr> <chr> <chr> <dbl> <int>
## 1 46069 Hyde South Dakota 2.8 3131
## 2 51720 Norton city Virginia 2.8 3131
## 3 30019 Daniels Montana 2.6 3133
## 4 31057 Dundy Nebraska 2.6 3133
## 5 13309 Wheeler Georgia 2.3 3135
## 6 38029 Emmons North Dakota 2.3 3135
## 7 36061 New York New York 1.9 3136
## 8 31183 Wheeler Nebraska 1.3 3137
## 9 48235 Irion Texas 0.9 3138
## 10 48261 Kenedy Texas 0 3140
## 11 48269 King Texas 0 3140
On average, Hawaii is the best ranked for carpooling.
county %>%
group_by(state) %>%
summarise(avgrank = mean(carpool_rank)) %>%
arrange(avgrank) %>%
top_n(-1)
## Selecting by avgrank
## # A tibble: 1 x 2
## state avgrank
## <chr> <dbl>
## 1 Hawaii 674.
# Hawaii
The top 5 states for carpooling are Hawaii, Arizona, Utah, Arkansas, and Alaska.
county %>%
group_by(state) %>%
summarise(avgrank = mean(carpool_rank)) %>%
arrange(avgrank) %>%
top_n(-5)
## Selecting by avgrank
## # A tibble: 5 x 2
## state avgrank
## <chr> <dbl>
## 1 Hawaii 674.
## 2 Arizona 1005.
## 3 Utah 1045.
## 4 Arkansas 1084.
## 5 Alaska 1108.