This note is to replicate an online post

Census data the easy(er) way

You can use the acs package by Ezra Glenn to download tabular data.

Choose variables

Choose your region and table

1. Set up the packages

library(tidyverse)
library(acs)
library(stringr) # to pad fips codes
library(openintro) # Use this as an example to follow when organizing raw data imported from Census.Bureau

2. Identify rural areas

Using population density

# Create a new variable, rural
countyComplete$rural <- ifelse(countyComplete$density < 1000, "rural", "urban")
countyComplete$rural <- factor(countyComplete$rural)

Using Urban and Rural variable in American Community Survey: rural population / total population

3. Get the tabular data (acs)

In order to do this you will need an API key from the US Census. Go to this site to request one (takes a minute or two) and then use the api.key.install function in the acs package to use the key.

api.key.install(key="e9b35c58fa3fa05101c9afa608b44263b9d63a88")
# create a geographic set to grab tabular data (acs)
geo<-geo.make(state=state,
              county="*")

# !!!! important note -- the package has not been updated to 2013
# data so I'm using the five year span that ends in 2012

income<-acs.fetch(endyear = 2013, span = 5, geography = geo,
                table.number = "B19001", col.names = "pretty")

# use of col.names = "pretty" above gives the full column definitions
# if you want Census variable IDs use col.names="auto". Here are the
# variables we want with pretty and auto results.
#"Household Income: Total:" ("B19001_001")
#"Household Income: $200,000 or more" ("B19001_017")


# the resulting "income" object is not a data.frame it's a list
# to see what's available

glimpse(income)
## Formal class 'acs' [package "acs"] with 9 slots
##   ..@ endyear       : int 2013
##   ..@ span          : int 5
##   ..@ geography     :'data.frame':   3221 obs. of  3 variables:
##   .. ..$ NAME  : chr [1:3221] "Autauga County, Alabama" "Baldwin County, Alabama" "Barbour County, Alabama" "Bibb County, Alabama" ...
##   .. ..$ state : int [1:3221] 1 1 1 1 1 1 1 1 1 1 ...
##   .. ..$ county: chr [1:3221] "001" "003" "005" "007" ...
##   ..@ acs.colnames  : chr [1:17] "Household Income: Total:" "Household Income: Less than $10,000" "Household Income: $10,000 to $14,999" "Household Income: $15,000 to $19,999" ...
##   ..@ modified      : logi FALSE
##   ..@ acs.units     : Factor w/ 5 levels "count","dollars",..: 1 1 1 1 1 1 1 1 1 1 ...
##   ..@ currency.year : int 2013
##   ..@ estimate      : num [1:3221, 1:17] 20071 73283 9200 7091 21108 ...
##   .. ..- attr(*, "dimnames")=List of 2
##   ..@ standard.error: num [1:3221, 1:17] 235 626 200 180 256 ...
##   .. ..- attr(*, "dimnames")=List of 2

names(attributes(income))
##  [1] "endyear"        "span"           "acs.units"      "currency.year" 
##  [5] "modified"       "geography"      "acs.colnames"   "estimate"      
##  [9] "standard.error" "class"
attr(income, "acs.colnames")
##  [1] "Household Income: Total:"              
##  [2] "Household Income: Less than $10,000"   
##  [3] "Household Income: $10,000 to $14,999"  
##  [4] "Household Income: $15,000 to $19,999"  
##  [5] "Household Income: $20,000 to $24,999"  
##  [6] "Household Income: $25,000 to $29,999"  
##  [7] "Household Income: $30,000 to $34,999"  
##  [8] "Household Income: $35,000 to $39,999"  
##  [9] "Household Income: $40,000 to $44,999"  
## [10] "Household Income: $45,000 to $49,999"  
## [11] "Household Income: $50,000 to $59,999"  
## [12] "Household Income: $60,000 to $74,999"  
## [13] "Household Income: $75,000 to $99,999"  
## [14] "Household Income: $100,000 to $124,999"
## [15] "Household Income: $125,000 to $149,999"
## [16] "Household Income: $150,000 to $199,999"
## [17] "Household Income: $200,000 or more"

# convert to a data.frame for merging
income_df <- data.frame(paste0(str_pad(income@geography$state, 2, "left", pad="0"), 
                             str_pad(income@geography$county, 3, "left", pad="0")), 
                        income@estimate[,c("Household Income: Total:","Household Income: $200,000 or more")], 
                        stringsAsFactors = FALSE)
income_df <- income_df[ ,1:3]
rownames(income_df)<-1:nrow(income_df)
names(income_df)<-c("GEOID", "total", "over_200")

glimpse(income_df)
## Observations: 3,221
## Variables: 3
## $ GEOID    <chr> "01001", "01003", "01005", "01007", "01009", "01011",...
## $ total    <dbl> 20071, 73283, 9200, 7091, 21108, 3741, 8235, 45196, 1...
## $ over_200 <dbl> 378, 2637, 59, 41, 194, 85, 32, 782, 191, 156, 236, 9...
head(income_df)
##   GEOID total over_200
## 1 01001 20071      378
## 2 01003 73283     2637
## 3 01005  9200       59
## 4 01007  7091       41
## 5 01009 21108      194
## 6 01011  3741       85

1. Set up the packages

library(tidyverse)
library(acs)
library(stringr) # to pad fips codes
library(openintro) # Use this as an example to follow when organizing raw data imported from Census.Bureau

2. Choose variables

  • Performance variables: median income, migration pattern, population growth
    • median_household_income
    • no_move_in_one_plus_year
    • poverty
    • percent_change_private_nonfarm_employment
  • fixed variables (in the short term)
    • pop2010
    • density
    • age_over_65
    • white
    • bachelors
    • mean_work_travel
    • median_val_owner_occupied
    • accommodation_food_service
  • controllable variables: expenditure on education, tax policies, expenditure on infrastructure

Select variables to be used for clustering (ones that are fixed in the short term) The goal is to provide a role model for towns: the town is very similar to us. Why can’t we be like them?! As an experiment, let’s start with pop2010, density, age_over_65, white, bachelors.

3. Identify rural areas

Using population density

# Create a new variable, rural
countyComplete$rural <- ifelse(countyComplete$density < 300, "rural", "urban") 
#300 excludes three less rural counties in southern NH: justify
countyComplete$rural <- factor(countyComplete$rural)
str(countyComplete)
## 'data.frame':    3143 obs. of  54 variables:
##  $ state                                    : Factor w/ 51 levels "Alabama","Alaska",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ name                                     : Factor w/ 1877 levels "Abbeville County",..: 83 90 101 151 166 227 237 250 298 320 ...
##  $ FIPS                                     : num  1001 1003 1005 1007 1009 ...
##  $ pop2010                                  : num  54571 182265 27457 22915 57322 ...
##  $ pop2000                                  : num  43671 140415 29038 20826 51024 ...
##  $ age_under_5                              : num  6.6 6.1 6.2 6 6.3 6.8 6.5 6.1 5.7 5.3 ...
##  $ age_under_18                             : num  26.8 23 21.9 22.7 24.6 22.3 24.1 22.9 22.5 21.4 ...
##  $ age_over_65                              : num  12 16.8 14.2 12.7 14.7 13.5 16.7 14.3 16.7 17.9 ...
##  $ female                                   : num  51.3 51.1 46.9 46.3 50.5 45.8 53 51.8 52.2 50.4 ...
##  $ white                                    : num  78.5 85.7 48 75.8 92.6 23 54.4 74.9 58.8 92.7 ...
##  $ black                                    : num  17.7 9.4 46.9 22 1.3 70.2 43.4 20.6 38.7 4.6 ...
##  $ native                                   : num  0.4 0.7 0.4 0.3 0.5 0.2 0.3 0.5 0.2 0.5 ...
##  $ asian                                    : num  0.9 0.7 0.4 0.1 0.2 0.2 0.8 0.7 0.5 0.2 ...
##  $ pac_isl                                  : num  NA NA NA NA NA NA 0 0.1 0 0 ...
##  $ two_plus_races                           : num  1.6 1.5 0.9 0.9 1.2 0.8 0.8 1.7 1.1 1.5 ...
##  $ hispanic                                 : num  2.4 4.4 5.1 1.8 8.1 7.1 0.9 3.3 1.6 1.2 ...
##  $ white_not_hispanic                       : num  77.2 83.5 46.8 75 88.9 21.9 54.1 73.6 58.1 92.1 ...
##  $ no_move_in_one_plus_year                 : num  86.3 83 83 90.5 87.2 88.5 92.8 82.9 86.2 88.1 ...
##  $ foreign_born                             : num  2 3.6 2.8 0.7 4.7 1.1 1.1 2.5 0.9 0.5 ...
##  $ foreign_spoken_at_home                   : num  3.7 5.5 4.7 1.5 7.2 3.8 1.6 4.5 1.6 1.4 ...
##  $ hs_grad                                  : num  85.3 87.6 71.9 74.5 74.7 74.7 74.8 78.5 71.8 73.4 ...
##  $ bachelors                                : num  21.7 26.8 13.5 10 12.5 12 11 16.1 10.8 10.5 ...
##  $ veterans                                 : num  5817 20396 2327 1883 4072 ...
##  $ mean_work_travel                         : num  25.1 25.8 23.8 28.3 33.2 28.1 25.1 22.1 23.6 26.2 ...
##  $ housing_units                            : num  22135 104061 11829 8981 23887 ...
##  $ home_ownership                           : num  77.5 76.7 68 82.9 82 76.9 69 70.7 71.4 77.5 ...
##  $ housing_multi_unit                       : num  7.2 22.6 11.1 6.6 3.7 9.9 13.7 14.3 8.7 4.3 ...
##  $ median_val_owner_occupied                : num  133900 177200 88200 81200 113700 ...
##  $ households                               : num  19718 69476 9795 7441 20605 ...
##  $ persons_per_household                    : num  2.7 2.5 2.52 3.02 2.73 2.85 2.58 2.46 2.51 2.22 ...
##  $ per_capita_income                        : num  24568 26469 15875 19918 21070 ...
##  $ median_household_income                  : num  53255 50147 33219 41770 45549 ...
##  $ poverty                                  : num  10.6 12.2 25 12.6 13.4 25.3 25 19.5 20.3 17.6 ...
##  $ private_nonfarm_establishments           : num  877 4812 522 318 749 ...
##  $ private_nonfarm_employment               : num  10628 52233 7990 2927 6968 ...
##  $ percent_change_private_nonfarm_employment: num  16.6 17.4 -27 -14 -11.4 -18.5 2.1 -5.6 -45.8 5.4 ...
##  $ nonemployment_establishments             : num  2971 14175 1527 1192 3501 ...
##  $ firms                                    : num  4067 19035 1667 1385 4458 ...
##  $ black_owned_firms                        : num  15.2 2.7 NA 14.9 NA NA NA 7.2 NA NA ...
##  $ native_owned_firms                       : num  NA 0.4 NA NA NA NA NA NA NA NA ...
##  $ asian_owned_firms                        : num  1.3 1 NA NA NA NA 3.3 1.6 NA NA ...
##  $ pac_isl_owned_firms                      : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ hispanic_owned_firms                     : num  0.7 1.3 NA NA NA NA NA 0.5 NA NA ...
##  $ women_owned_firms                        : num  31.7 27.3 27 NA 23.2 38.8 NA 24.7 29.3 14.5 ...
##  $ manufacturer_shipments_2007              : num  NA 1410273 NA 0 341544 ...
##  $ mercent_whole_sales_2007                 : num  NA NA NA NA NA ...
##  $ sales                                    : num  598175 2966489 188337 124707 319700 ...
##  $ sales_per_capita                         : num  12003 17166 6334 5804 5622 ...
##  $ accommodation_food_service               : num  88157 436955 NA 10757 20941 ...
##  $ building_permits                         : num  191 696 10 8 18 1 3 107 10 6 ...
##  $ fed_spending                             : num  331142 1119082 240308 163201 294114 ...
##  $ area                                     : num  594 1590 885 623 645 ...
##  $ density                                  : num  91.8 114.6 31 36.8 88.9 ...
##  $ rural                                    : Factor w/ 2 levels "rural","urban": 1 1 1 1 1 1 1 1 1 1 ...

countyRural <- 
  countyComplete %>% 
  filter(rural == "rural") %>%
  select("state", "name", "FIPS", "pop2010", "density", "age_over_65", "bachelors", 
         "median_household_income", "no_move_in_one_plus_year", "poverty")
str(countyRural)
## 'data.frame':    2747 obs. of  10 variables:
##  $ state                   : Factor w/ 51 levels "Alabama","Alaska",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ name                    : Factor w/ 1877 levels "Abbeville County",..: 83 90 101 151 166 227 237 250 298 320 ...
##  $ FIPS                    : num  1001 1003 1005 1007 1009 ...
##  $ pop2010                 : num  54571 182265 27457 22915 57322 ...
##  $ density                 : num  91.8 114.6 31 36.8 88.9 ...
##  $ age_over_65             : num  12 16.8 14.2 12.7 14.7 13.5 16.7 14.3 16.7 17.9 ...
##  $ bachelors               : num  21.7 26.8 13.5 10 12.5 12 11 16.1 10.8 10.5 ...
##  $ median_household_income : num  53255 50147 33219 41770 45549 ...
##  $ no_move_in_one_plus_year: num  86.3 83 83 90.5 87.2 88.5 92.8 82.9 86.2 88.1 ...
##  $ poverty                 : num  10.6 12.2 25 12.6 13.4 25.3 25 19.5 20.3 17.6 ...

4. Explore the data

head(countyRural)
##     state           name FIPS pop2010 density age_over_65 bachelors
## 1 Alabama Autauga County 1001   54571    91.8        12.0      21.7
## 2 Alabama Baldwin County 1003  182265   114.6        16.8      26.8
## 3 Alabama Barbour County 1005   27457    31.0        14.2      13.5
## 4 Alabama    Bibb County 1007   22915    36.8        12.7      10.0
## 5 Alabama  Blount County 1009   57322    88.9        14.7      12.5
## 6 Alabama Bullock County 1011   10914    17.5        13.5      12.0
##   median_household_income no_move_in_one_plus_year poverty
## 1                   53255                     86.3    10.6
## 2                   50147                     83.0    12.2
## 3                   33219                     83.0    25.0
## 4                   41770                     90.5    12.6
## 5                   45549                     87.2    13.4
## 6                   31602                     88.5    25.3
summary(countyRural)
##       state                     name           FIPS      
##  Texas   : 235   Washington County:  24   Min.   : 1001  
##  Georgia : 134   Franklin County  :  23   1st Qu.:19022  
##  Kentucky: 113   Lincoln County   :  23   Median :29133  
##  Missouri: 108   Jackson County   :  22   Mean   :30201  
##  Kansas  : 101   Jefferson County :  21   3rd Qu.:45068  
##  Iowa    :  97   Madison County   :  17   Max.   :56045  
##  (Other) :1959   (Other)          :2617                  
##     pop2010           density        age_over_65      bachelors    
##  Min.   :     82   Min.   :  0.00   Min.   : 3.50   Min.   : 3.70  
##  1st Qu.:   9982   1st Qu.: 13.80   1st Qu.:13.70   1st Qu.:12.60  
##  Median :  21704   Median : 36.80   Median :16.00   Median :15.90  
##  Mean   :  40735   Mean   : 57.25   Mean   :16.33   Mean   :17.36  
##  3rd Qu.:  45664   3rd Qu.: 75.80   3rd Qu.:18.60   3rd Qu.:20.20  
##  Max.   :2035210   Max.   :299.80   Max.   :43.40   Max.   :64.00  
##                                                                    
##  median_household_income no_move_in_one_plus_year    poverty     
##  Min.   : 19351          Min.   : 51.60           Min.   : 0.00  
##  1st Qu.: 36353          1st Qu.: 83.60           1st Qu.:11.40  
##  Median : 41437          Median : 86.70           Median :15.10  
##  Mean   : 42386          Mean   : 86.09           Mean   :15.91  
##  3rd Qu.: 47175          3rd Qu.: 89.20           3rd Qu.:19.30  
##  Max.   :103643          Max.   :100.00           Max.   :53.50  
## 
lapply(countyRural[, -c(1:3)], hist)

## $pop2010
## $breaks
##  [1]       0  200000  400000  600000  800000 1000000 1200000 1400000
##  [9] 1600000 1800000 2000000 2200000
## 
## $counts
##  [1] 2692   43    6    1    3    0    0    0    0    1    1
## 
## $density
##  [1] 4.899891e-06 7.826720e-08 1.092100e-08 1.820167e-09 5.460502e-09
##  [6] 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 1.820167e-09
## [11] 1.820167e-09
## 
## $mids
##  [1]  100000  300000  500000  700000  900000 1100000 1300000 1500000
##  [9] 1700000 1900000 2100000
## 
## $xname
## [1] "X[[i]]"
## 
## $equidist
## [1] TRUE
## 
## attr(,"class")
## [1] "histogram"
## 
## $density
## $breaks
##  [1]   0  20  40  60  80 100 120 140 160 180 200 220 240 260 280 300
## 
## $counts
##  [1] 880 583 381 249 172 112  85  56  49  42  33  34  25  22  24
## 
## $density
##  [1] 0.0160174736 0.0106115763 0.0069348380 0.0045322170 0.0031306880
##  [6] 0.0020385876 0.0015471423 0.0010192938 0.0008918821 0.0007644703
## [11] 0.0006006553 0.0006188569 0.0004550419 0.0004004368 0.0004368402
## 
## $mids
##  [1]  10  30  50  70  90 110 130 150 170 190 210 230 250 270 290
## 
## $xname
## [1] "X[[i]]"
## 
## $equidist
## [1] TRUE
## 
## attr(,"class")
## [1] "histogram"
## 
## $age_over_65
## $breaks
##  [1]  0  5 10 15 20 25 30 35 40 45
## 
## $counts
## [1]    4  125  956 1194  383   74   10    0    1
## 
## $density
## [1] 0.0002912268 0.0091008373 0.0696032035 0.0869311977 0.0278849654
## [6] 0.0053876957 0.0007280670 0.0000000000 0.0000728067
## 
## $mids
## [1]  2.5  7.5 12.5 17.5 22.5 27.5 32.5 37.5 42.5
## 
## $xname
## [1] "X[[i]]"
## 
## $equidist
## [1] TRUE
## 
## attr(,"class")
## [1] "histogram"
## 
## $bachelors
## $breaks
##  [1]  0  5 10 15 20 25 30 35 40 45 50 55 60 65
## 
## $counts
##  [1]   4 239 950 851 371 178  83  32  21  13   3   1   1
## 
## $density
##  [1] 0.0002912268 0.0174008009 0.0691663633 0.0619585002 0.0270112850
##  [6] 0.0129595923 0.0060429560 0.0023298143 0.0015289407 0.0009464871
## [11] 0.0002184201 0.0000728067 0.0000728067
## 
## $mids
##  [1]  2.5  7.5 12.5 17.5 22.5 27.5 32.5 37.5 42.5 47.5 52.5 57.5 62.5
## 
## $xname
## [1] "X[[i]]"
## 
## $equidist
## [1] TRUE
## 
## attr(,"class")
## [1] "histogram"
## 
## $median_household_income
## $breaks
##  [1]  15000  20000  25000  30000  35000  40000  45000  50000  55000  60000
## [11]  65000  70000  75000  80000  85000  90000  95000 100000 105000
## 
## $counts
##  [1]   3  39 128 373 624 684 415 227 121  67  35  15   7   8   0   0   0
## [18]   1
## 
## $density
##  [1] 2.184201e-07 2.839461e-06 9.319257e-06 2.715690e-05 4.543138e-05
##  [6] 4.979978e-05 3.021478e-05 1.652712e-05 8.809610e-06 4.878049e-06
## [11] 2.548234e-06 1.092100e-06 5.096469e-07 5.824536e-07 0.000000e+00
## [16] 0.000000e+00 0.000000e+00 7.280670e-08
## 
## $mids
##  [1]  17500  22500  27500  32500  37500  42500  47500  52500  57500  62500
## [11]  67500  72500  77500  82500  87500  92500  97500 102500
## 
## $xname
## [1] "X[[i]]"
## 
## $equidist
## [1] TRUE
## 
## attr(,"class")
## [1] "histogram"
## 
## $no_move_in_one_plus_year
## $breaks
##  [1]  50  55  60  65  70  75  80  85  90  95 100
## 
## $counts
##  [1]    1    2    1   13   50  182  724 1275  471   28
## 
## $density
##  [1] 0.0000728067 0.0001456134 0.0000728067 0.0009464871 0.0036403349
##  [6] 0.0132508191 0.0527120495 0.0928285402 0.0342919549 0.0020385876
## 
## $mids
##  [1] 52.5 57.5 62.5 67.5 72.5 77.5 82.5 87.5 92.5 97.5
## 
## $xname
## [1] "X[[i]]"
## 
## $equidist
## [1] TRUE
## 
## attr(,"class")
## [1] "histogram"
## 
## $poverty
## $breaks
##  [1]  0  5 10 15 20 25 30 35 40 45 50 55
## 
## $counts
##  [1]  25 431 914 781 375 138  54  15   8   5   1
## 
## $density
##  [1] 0.0018201675 0.0313796869 0.0665453222 0.0568620313 0.0273025118
##  [6] 0.0100473244 0.0039315617 0.0010921005 0.0005824536 0.0003640335
## [11] 0.0000728067
## 
## $mids
##  [1]  2.5  7.5 12.5 17.5 22.5 27.5 32.5 37.5 42.5 47.5 52.5
## 
## $xname
## [1] "X[[i]]"
## 
## $equidist
## [1] TRUE
## 
## attr(,"class")
## [1] "histogram"
#hist(countyRural$median_household_income, breaks = 100)

5. PREPARING AND TRANSFORMING DATA

# Copy customer data into new data frame
new_data = countyRural[ , c("pop2010", "density", "age_over_65", "bachelors")]

# Remove customer id as a variable, store it as row names
head(new_data)
##   pop2010 density age_over_65 bachelors
## 1   54571    91.8        12.0      21.7
## 2  182265   114.6        16.8      26.8
## 3   27457    31.0        14.2      13.5
## 4   22915    36.8        12.7      10.0
## 5   57322    88.9        14.7      12.5
## 6   10914    17.5        13.5      12.0
row.names(new_data) = new_data$FIPS
new_data$FIPS = NULL
head(new_data)
##   pop2010 density age_over_65 bachelors
## 1   54571    91.8        12.0      21.7
## 2  182265   114.6        16.8      26.8
## 3   27457    31.0        14.2      13.5
## 4   22915    36.8        12.7      10.0
## 5   57322    88.9        14.7      12.5
## 6   10914    17.5        13.5      12.0

# Take the log-transform of the data, and plot
#new_data$median_household_income = log(new_data$median_household_income)
new_data <- data.frame(lapply(new_data, log))
lapply(new_data, hist)

## $pop2010
## $breaks
##  [1]  4  5  6  7  8  9 10 11 12 13 14 15
## 
## $counts
##  [1]   2   1  35 124 374 853 860 412  78   6   2
## 
## $density
##  [1] 0.0007280670 0.0003640335 0.0127411722 0.0451401529 0.1361485257
##  [6] 0.3105205679 0.3130688023 0.1499817983 0.0283946123 0.0021842009
## [11] 0.0007280670
## 
## $mids
##  [1]  4.5  5.5  6.5  7.5  8.5  9.5 10.5 11.5 12.5 13.5 14.5
## 
## $xname
## [1] "X[[i]]"
## 
## $equidist
## [1] TRUE
## 
## attr(,"class")
## [1] "histogram"
## 
## $density
## $breaks
##  [1] -2.5 -2.0 -1.5 -1.0 -0.5  0.0  0.5  1.0  1.5  2.0  2.5  3.0  3.5  4.0
## [15]  4.5  5.0  5.5  6.0
## 
## $counts
##  [1]   5   2  10  17  35  54  89 128 116 168 255 398 484 430 294 199  62
## 
## $density
##  [1] 0.003641661 0.001456664 0.007283321 0.012381646 0.025491624
##  [6] 0.039329934 0.064821559 0.093226511 0.084486526 0.122359796
## [11] 0.185724690 0.289876184 0.352512746 0.313182811 0.214129643
## [16] 0.144938092 0.045156591
## 
## $mids
##  [1] -2.25 -1.75 -1.25 -0.75 -0.25  0.25  0.75  1.25  1.75  2.25  2.75
## [12]  3.25  3.75  4.25  4.75  5.25  5.75
## 
## $xname
## [1] "X[[i]]"
## 
## $equidist
## [1] TRUE
## 
## attr(,"class")
## [1] "histogram"
## 
## $age_over_65
## $breaks
##  [1] 1.2 1.4 1.6 1.8 2.0 2.2 2.4 2.6 2.8 3.0 3.2 3.4 3.6 3.8
## 
## $counts
##  [1]   2   2   6  17  42 148 404 908 750 372  84  11   1
## 
## $density
##  [1] 0.003640335 0.003640335 0.010921005 0.030942847 0.076447033
##  [6] 0.269384783 0.735347652 1.652712050 1.365125592 0.677102293
## [11] 0.152894066 0.020021842 0.001820167
## 
## $mids
##  [1] 1.3 1.5 1.7 1.9 2.1 2.3 2.5 2.7 2.9 3.1 3.3 3.5 3.7
## 
## $xname
## [1] "X[[i]]"
## 
## $equidist
## [1] TRUE
## 
## attr(,"class")
## [1] "histogram"
## 
## $bachelors
## $breaks
##  [1] 1.2 1.4 1.6 1.8 2.0 2.2 2.4 2.6 2.8 3.0 3.2 3.4 3.6 3.8 4.0 4.2
## 
## $counts
##  [1]   1   3   5  21 114 249 457 628 566 356 191  95  39  20   2
## 
## $density
##  [1] 0.001820167 0.005460502 0.009100837 0.038223517 0.207499090
##  [6] 0.453221696 0.831816527 1.143065162 1.030214780 0.647979614
## [11] 0.347651984 0.172915908 0.070986531 0.036403349 0.003640335
## 
## $mids
##  [1] 1.3 1.5 1.7 1.9 2.1 2.3 2.5 2.7 2.9 3.1 3.3 3.5 3.7 3.9 4.1
## 
## $xname
## [1] "X[[i]]"
## 
## $equidist
## [1] TRUE
## 
## attr(,"class")
## [1] "histogram"

# Standardize variables
new_data = scale(new_data)
head(new_data)
##          pop2010 density age_over_65  bachelors
## [1,]  0.81147855     NaN  -1.0625413  0.7929683
## [2,]  1.82507608     NaN   0.2333486  1.3624904
## [3,]  0.23416099     NaN  -0.4142143 -0.4875685
## [4,]  0.08217551     NaN  -0.8441848 -1.2972539
## [5,]  0.85281548     NaN  -0.2809345 -0.6952102
## [6,] -0.54125465     NaN  -0.6089115 -0.8053484

6. RUNNING A HIERARCHICAL SEGMENTATION

# Compute distance metrics on standardized data
# This will likely generate an error on most machines
# d = dist(new_data)

# Take a 10% sample
#sample = seq(1, 18417, by = 10)
#head(sample)
#customers_sample = customers[sample, ]
#new_data_sample  = new_data[sample, ]

# Compute distance metrics on standardized data
d = dist(new_data)

# Perform hierarchical clustering on distance metrics
c = hclust(d, method="ward.D2")

# Plot de dendogram
plot(c)


# Cut at 9 segments
members = cutree(c, k = 5) #Dendogram appears to show that there 5 distinctive groups

# Show 30 first customers, frequency table
members[1:30]
##  [1] 1 1 2 3 3 3 3 1 3 3 2 3 3 3 3 1 4 3 3 3 3 4 4 4 3 1 3 4 3 3
table(members)
## members
##   1   2   3   4   5 
## 513 233 734 541 726

# Show profile of each segment
aggregate(countyRural[, -c(1:3)], by = list(members), mean)
##   Group.1    pop2010   density age_over_65 bachelors
## 1       1 110628.487 123.80136    12.96257  26.19025
## 2       2  32140.403  52.88455    11.44163  14.60558
## 3       3  20805.881  41.90327    15.65831  11.47888
## 4       4  49537.026  76.54048    17.21645  18.52680
## 5       5   7693.536  12.75647    20.30482  17.06983
##   median_household_income no_move_in_one_plus_year  poverty
## 1                50254.31                 82.57758 13.89006
## 2                43935.20                 84.55966 18.47811
## 3                36723.44                 87.00395 19.53365
## 4                43963.12                 86.27079 14.16599
## 5                40877.81                 88.00413 14.15014

7. Check the characteristics of outliers in each cluster

The outliers should provide role models for struggling towns in each cluster.

library(knitr) # for kable()
library(kableExtra)

countyRural <- cbind(countyRural, members)
head(countyRural)
##     state           name FIPS pop2010 density age_over_65 bachelors
## 1 Alabama Autauga County 1001   54571    91.8        12.0      21.7
## 2 Alabama Baldwin County 1003  182265   114.6        16.8      26.8
## 3 Alabama Barbour County 1005   27457    31.0        14.2      13.5
## 4 Alabama    Bibb County 1007   22915    36.8        12.7      10.0
## 5 Alabama  Blount County 1009   57322    88.9        14.7      12.5
## 6 Alabama Bullock County 1011   10914    17.5        13.5      12.0
##   median_household_income no_move_in_one_plus_year poverty members
## 1                   53255                     86.3    10.6       1
## 2                   50147                     83.0    12.2       1
## 3                   33219                     83.0    25.0       2
## 4                   41770                     90.5    12.6       3
## 5                   45549                     87.2    13.4       3
## 6                   31602                     88.5    25.3       3

ggplot(data = countyRural, 
       aes(x = factor(members), y = median_household_income)) + 
  geom_boxplot()


countyRural %>%
  group_by(members) %>%
  top_n(3, median_household_income) %>%
  arrange(members, desc(median_household_income)) %>%
  kable()
state name FIPS pop2010 density age_over_65 bachelors median_household_income no_move_in_one_plus_year poverty members
New Mexico Los Alamos County 35028 17950 164.4 15.0 64.0 103643 86.0 2.4 1
Virginia Fauquier County 51061 65203 100.7 12.7 30.8 83877 90.1 5.4 1
Massachusetts Nantucket County 25019 10172 226.2 12.1 40.2 83347 89.7 7.2 1
Alaska Bristol Bay Borough 2060 997 2.0 8.3 15.9 84000 77.7 5.0 2
Colorado Elbert County 8039 23086 12.5 9.5 29.2 78958 90.9 3.5 2
Virginia King George County 51099 23584 131.3 10.2 30.5 76241 83.3 7.1 2
Nevada Lander County 32015 5775 1.1 11.8 12.9 66525 80.6 12.2 3
Louisiana Cameron Parish 22023 6839 5.3 12.9 11.7 59555 87.6 11.6 3
Nevada Pershing County 32027 6753 1.1 13.0 12.4 56491 81.2 13.7 3
Virginia Fluvanna County 51065 25691 89.8 15.7 25.8 68223 91.7 7.5 4
Virginia King William County 51101 15935 58.2 12.4 19.8 64946 91.8 7.6 4
Virginia Botetourt County 51023 33148 61.2 16.4 22.9 64724 91.6 5.6 4
Texas Loving County 48301 82 0.1 14.6 17.9 83889 92.7 0.0 5
Colorado Hinsdale County 8053 843 0.8 17.4 42.0 74659 90.2 3.7 5
Texas Hartley County 48205 6062 4.1 12.5 19.8 66583 79.0 9.3 5

countyRural %>%
  filter(state == "New Hampshire")
##           state             name  FIPS pop2010 density age_over_65
## 1 New Hampshire   Belknap County 33001   60088   150.1        16.7
## 2 New Hampshire   Carroll County 33003   47818    51.4        20.6
## 3 New Hampshire  Cheshire County 33005   77117   109.1        14.7
## 4 New Hampshire      Coos County 33007   33055    18.4        19.4
## 5 New Hampshire   Grafton County 33009   89118    52.2        15.5
## 6 New Hampshire Merrimack County 33013  146445   156.8        13.7
## 7 New Hampshire  Sullivan County 33019   43742    81.4        16.5
##   bachelors median_household_income no_move_in_one_plus_year poverty
## 1      26.9                   54929                     88.7     8.6
## 2      30.3                   49897                     89.4     9.6
## 3      29.8                   53828                     84.8    10.0
## 4      16.1                   41534                     85.4    12.1
## 5      35.3                   53075                     84.0     9.8
## 6      32.7                   63012                     85.2     8.1
## 7      25.9                   50689                     87.0    10.0
##   members
## 1       1
## 2       4
## 3       1
## 4       3
## 5       1
## 6       1
## 7       1