Loading the data and initial cleaning

Cleaning the education data

education <- 
  read_xlsx("Education.xlsx", skip = 3) |> 
  # Making the names R friendly
  janitor::clean_names() |> 
  # Connecticut's counties are missing the 2018-22 values, so we fall back
  # to the most recent earlier estimates (2008-12)
  mutate(
    # No high school
    percent_of_adults_with_less_than_a_high_school_diploma_2018_22 = if_else(
      is.na(percent_of_adults_with_less_than_a_high_school_diploma_2018_22),
      percent_of_adults_with_less_than_a_high_school_diploma_2008_12,
      percent_of_adults_with_less_than_a_high_school_diploma_2018_22
    ),
    # High school only
    percent_of_adults_with_a_high_school_diploma_only_2018_22 = if_else(
      is.na(percent_of_adults_with_a_high_school_diploma_only_2018_22),
      percent_of_adults_with_a_high_school_diploma_only_2008_12,
      percent_of_adults_with_a_high_school_diploma_only_2018_22
    ),
    # Some college or associate's degree
    percent_of_adults_completing_some_college_or_associates_degree_2018_22 = if_else(
      is.na(percent_of_adults_completing_some_college_or_associates_degree_2018_22),
      percent_of_adults_completing_some_college_or_associates_degree_2008_12,
      percent_of_adults_completing_some_college_or_associates_degree_2018_22
    ),
    # Bachelor's degree or higher
    percent_of_adults_with_a_bachelors_degree_or_higher_2018_22 = if_else(
      is.na(percent_of_adults_with_a_bachelors_degree_or_higher_2018_22),
      percent_of_adults_with_a_bachelors_degree_or_higher_2008_12,
      percent_of_adults_with_a_bachelors_degree_or_higher_2018_22
    )
  ) |> 
  # Keeping specific columns and renaming some
  dplyr::select(fips = fips_code, 
                state, 
                area = area_name,
                contains("2023"),
                ends_with("22")) |> 
  # Removing the education counts columns (keep percentage)
  dplyr::select(
    fips:area,
    x2023_rural_urban_continuum_code,
    starts_with("percent")
  ) |>  
  # Removing states (fips are multiples of 1000)
  filter(as.numeric(fips) %% 1000 != 0)

# Changing the column names
colnames(education) <- c("fips", "state", "county", "rucc", 
                         "no_hs", "high_school", "some_col", "bach")

Loading and initial cleaning of the poverty data

poverty <- 
  read_xlsx("PovertyEstimates.xlsx", skip = 4) |> 
  # Making the names R friendly
  janitor::clean_names() |> 
  # Keeping ID columns, poverty percent columns, and median HH income
  dplyr::select(fips = fips_code, 
                state = stabr, 
                area = area_name,
                contains("pct"),
                medhhinc = medhhinc_2021) |>  
  # Removing states (fips are multiples of 1000)
  filter(as.numeric(fips) %% 1000 != 0)

# Changing the column names
colnames(poverty) <- c("fips", "state", "county", "poverty", "minor_poverty",
                       "child_poverty", "infant_poverty", "medhhinc")

Loading and initial cleaning of the unemployment data

unemploy <- 
  read_xlsx("Unemployment.xlsx", skip = 4) |> 
  # Making the names R friendly
  janitor::clean_names()   |> 
  # Keeping specific columns and renaming some
  dplyr::select(fips = fips_code, 
                state, 
                county = area_name,
                ends_with("2021"),
                ends_with("2022")) |> 
  # Keeping the unemployment rate and relative median household income columns
  dplyr::select(
    fips:county,
    unemployment = unemployment_rate_2022,
    mhhi_per = med_hh_income_percent_of_state_total_2021
  ) |>  
  # Removing states (fips are multiples of 1000)
  filter(as.numeric(fips) %% 1000 != 0)

Merging the three data sets into one data set, counties, by fips and state

counties <-
  # Merging education and poverty
  inner_join(
    x = education,
    y = poverty |> dplyr::select(-county),
    by = c("fips", "state")
  ) |> 
  inner_join(
    y = unemploy |> dplyr::select(-county),
    # unemploy stores "county, state" in its county column, so we drop it and join by fips and state
    by = c("fips", "state")
  ) |> 
  # Removing Washington DC
  filter(state != "DC") |> 
  # Creating row names: county, state
  mutate(county_state = paste0(county, ", ", state)) |> 
  column_to_rownames(var = "county_state") |> 
  dplyr::select(-county, -state)

skimr::skim(counties)
Data summary
Name counties
Number of rows 3141
Number of columns 13
_______________________
Column type frequency:
character 1
numeric 12
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
fips 0 1 5 5 0 3141 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
rucc 8 1 5.26 2.93 1.00 2.00 6.00 8.00 9.00 ▆▅▁▅▇
no_hs 0 1 11.65 5.72 0.31 7.47 10.46 14.72 66.67 ▇▃▁▁▁
high_school 0 1 33.95 7.55 7.13 29.11 34.25 39.37 55.68 ▁▂▇▆▁
some_col 0 1 30.94 5.34 3.45 27.38 30.96 34.31 80.95 ▁▇▂▁▁
bach 0 1 23.46 10.01 0.00 16.51 20.94 27.86 78.87 ▃▇▂▁▁
poverty 0 1 14.61 5.66 2.90 10.60 13.60 17.60 43.90 ▅▇▂▁▁
minor_poverty 0 1 20.05 8.41 2.80 13.80 19.10 24.80 58.50 ▅▇▃▁▁
child_poverty 0 1 18.90 8.14 2.40 12.80 17.80 23.40 61.10 ▅▇▂▁▁
infant_poverty 3141 0 NaN NA NA NA NA NA NA
medhhinc 0 1 58931.40 15259.34 25653.00 49014.00 56632.00 65682.00 153716.00 ▅▇▁▁▁
unemployment 0 1 3.59 1.23 0.60 2.70 3.40 4.20 14.70 ▇▇▁▁▁
mhhi_per 0 1 89.36 19.75 38.40 76.30 86.90 99.20 246.90 ▅▇▁▁▁

The initial takeaway from forming the counties data is that the infant poverty rate (ages 0 to 4) is only reported at the state level, not the county level, so we'll remove it during the data preparation step.

# Dropping the infant poverty column, since it is missing for every county
counties <- 
  counties |> 
  dplyr::select(-infant_poverty) 


skimr::skim(counties)
Data summary
Name counties
Number of rows 3141
Number of columns 12
_______________________
Column type frequency:
character 1
numeric 11
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
fips 0 1 5 5 0 3141 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
rucc 8 1 5.26 2.93 1.00 2.00 6.00 8.00 9.00 ▆▅▁▅▇
no_hs 0 1 11.65 5.72 0.31 7.47 10.46 14.72 66.67 ▇▃▁▁▁
high_school 0 1 33.95 7.55 7.13 29.11 34.25 39.37 55.68 ▁▂▇▆▁
some_col 0 1 30.94 5.34 3.45 27.38 30.96 34.31 80.95 ▁▇▂▁▁
bach 0 1 23.46 10.01 0.00 16.51 20.94 27.86 78.87 ▃▇▂▁▁
poverty 0 1 14.61 5.66 2.90 10.60 13.60 17.60 43.90 ▅▇▂▁▁
minor_poverty 0 1 20.05 8.41 2.80 13.80 19.10 24.80 58.50 ▅▇▃▁▁
child_poverty 0 1 18.90 8.14 2.40 12.80 17.80 23.40 61.10 ▅▇▂▁▁
medhhinc 0 1 58931.40 15259.34 25653.00 49014.00 56632.00 65682.00 153716.00 ▅▇▁▁▁
unemployment 0 1 3.59 1.23 0.60 2.70 3.40 4.20 14.70 ▇▇▁▁▁
mhhi_per 0 1 89.36 19.75 38.40 76.30 86.90 99.20 246.90 ▅▇▁▁▁

Removed Counties

The counties data set has 3141 rows (1 row = 1 county), while each of the three source data sets has more rows, indicating that not all rows were retained during the joins. Let's check which counties were "left behind" by our inner joins.

Education

education |> 
  # Looking at rows in education but not in counties
  filter(!(fips %in% counties$fips)) |> 
  count(state) |> 
  arrange(-n) |> 
  rename(`drop rows` = n) |> 
  gt::gt()
state drop rows
PR 78
CT 9
AK 8
VA 2
DC 1
HI 1
MT 1

The dropped rows in education are mostly from Puerto Rico (PR), Connecticut (CT), and Alaska (AK). Puerto Rico shouldn't be included since it is not one of the states, but what about the dropped rows from the 50 states?

education |> 
  # Looking at rows in education but not in counties
  filter(
    !(fips %in% counties$fips), 
    state %in% c("AK", "CT")
  )
## # A tibble: 17 × 8
##    fips  state county                      rucc no_hs high_school some_col  bach
##    <chr> <chr> <chr>                      <dbl> <dbl>       <dbl>    <dbl> <dbl>
##  1 02010 AK    Aleutian Islands              NA NA           NA       NA    NA  
##  2 02160 AK    Kuskokwim Division            NA NA           NA       NA    NA  
##  3 02201 AK    Prince of Wales-Outer Ket…    NA NA           NA       NA    NA  
##  4 02231 AK    Skagway-Yakutat-Angoon Ce…    NA NA           NA       NA    NA  
##  5 02232 AK    Skagway-Hoonah-Angoon Cen…    NA NA           NA       NA    NA  
##  6 02250 AK    Upper Yukon Division          NA NA           NA       NA    NA  
##  7 02261 AK    Valdez-Cordova Census Area    NA 10.0         23.6     42.7  23.6
##  8 02280 AK    Wrangell-Petersburg Censu…    NA NA           NA       NA    NA  
##  9 09110 CT    Capitol Planning Region, …     1  8.99        25.2     24.7  41.1
## 10 09120 CT    Greater Bridgeport Planni…     2 11.7         25.2     21.4  41.7
## 11 09130 CT    Lower Connecticut River V…     1  4.73        23.8     26.2  45.3
## 12 09140 CT    Naugatuck Valley Planning…     2  9.65        30.1     27.0  33.3
## 13 09150 CT    Northeastern Connecticut …     4  8.72        34.8     31.4  25.0
## 14 09160 CT    Northwest Hills Planning …     4  6.81        28.6     26.7  37.9
## 15 09170 CT    South Central Connecticut…     2  8.57        28.4     22.6  40.4
## 16 09180 CT    Southeastern Connecticut …     2  7.61        29.6     29.0  33.8
## 17 09190 CT    Western Connecticut Plann…     2  8.36        18.5     18.9  54.2

For Alaska and Connecticut, the rows correspond not to counties but to census areas (AK) and planning regions (CT). Since they don't represent counties, dropping them is correct. What about Virginia (VA), Hawaii (HI), and Montana (MT)?

education |> 
  # Looking at rows in education but not in counties
  filter(
    !(fips %in% counties$fips),
    state %in% c("VA", "HI", "MT")
  )
## # A tibble: 4 × 8
##   fips  state county                     rucc no_hs high_school some_col  bach
##   <chr> <chr> <chr>                     <dbl> <dbl>       <dbl>    <dbl> <dbl>
## 1 15005 HI    Kalawao County                3   8          34       24    34  
## 2 30113 MT    Yellowstone National Park    NA  NA          NA       NA    NA  
## 3 51515 VA    Bedford city                 NA  15.2        33.1     30.4  21.3
## 4 51560 VA    Clifton Forge city           NA  NA          NA       NA    NA

The two Virginia rows are former independent cities that were not part of any county but have since been incorporated into counties, so we'll keep excluding them. Yellowstone National Park is also not a county, but a national park (as the name suggests). Kalawao County, Hawaii is the smallest county in the country by land area, has a population of 82 (as of 2020), doesn't have an elected government, and is a judicial district of Maui County.

All of the rows excluded from the education data set are appropriately left out of the merged data set. But what about the two rows excluded from the poverty data set?

Poverty

poverty |> 
  # Looking at rows in poverty but not in counties
  filter(!(fips %in% counties$fips))
## # A tibble: 2 × 8
##   fips  state county poverty minor_poverty child_poverty infant_poverty medhhinc
##   <chr> <chr> <chr>    <dbl>         <dbl>         <dbl>          <dbl>    <dbl>
## 1 11001 DC    Distr…    16.8          25.5          25.5             NA    91072
## 2 15005 HI    Kalaw…    NA            NA            NA               NA       NA

One is the District of Columbia (not a county), and the other is the Hawaiian county we saw in the education data!

Unemployment

unemploy |> 
  # Looking at rows in unemploy but not in counties
  filter(!(fips %in% counties$fips)) |> 
  count(state)
## # A tibble: 3 × 2
##   state     n
##   <chr> <int>
## 1 AK        4
## 2 DC        1
## 3 PR       78

It appears we have the same non-counties as in the education data (DC and PR), and likely the same issue with Alaska. Let's check Alaska again anyway, since only 4 rows are excluded here instead of 8, just in case something new shows up.

unemploy |> 
  # Looking at rows in unemploy but not in counties
  filter(
    !(fips %in% counties$fips),
    state == "AK"
  )
## # A tibble: 4 × 5
##   fips  state county                                       unemployment mhhi_per
##   <chr> <chr> <chr>                                               <dbl>    <dbl>
## 1 02201 AK    Prince of Wales-Outer Ketchikan Census Area…           NA       NA
## 2 02232 AK    Skagway-Hoonah-Angoon Census Area, AK                  NA       NA
## 3 02261 AK    Valdez-Cordova Census Area, AK                         NA       NA
## 4 02280 AK    Wrangell-Petersburg Census Area, AK                    NA       NA

Nothing new: the same rows as in the education data set, just fewer census areas.

Data Prep before clustering

EDA Plots

Let’s create a set of box plots for each of the percentage columns to get an idea of the range of each feature:

boxplot_counties <- 
  counties |> 
  rename(
    `No High School` = no_hs,
    `High School Only` = high_school,
    `Some College` = some_col,
    `Bachelor or Higher` = bach,
    Poverty = poverty,
    `Minor Poverty` = minor_poverty,
    `Child Poverty` = child_poverty,
    Unemployment = unemployment,
    `Median Household Income (Relative)` = mhhi_per
  ) |> 
  pivot_longer(
    cols = c(-fips, -rucc, -medhhinc),
    names_to = "feature",
    values_to = "percentage"
  ) |> 
  ggplot(
    mapping = aes(
      x = percentage/100
    )
  ) +
  geom_boxplot(
    fill = "steelblue"
  ) + 
  labs(
    x = NULL,
    title = "US County Features"
  ) +
  
  theme_bw() +
  theme(
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank(),
    plot.title = element_text(hjust = 0.5),
    panel.grid.major.y = element_blank(),
    panel.grid.minor.y = element_blank()
  )

boxplot_counties +
  # Changing x-axis to a percentage
  scale_x_continuous(labels = scales::label_percent()) + 
  # a plot per feature
  facet_wrap(
    facets = vars(as_factor(feature)),
    #scales = "free"
  )

Since many features have large outliers, we need a method of rescaling the data that is not sensitive to the large outliers. We’ll use robust scaling, which is similar to standardization, but replaces the mean and standard deviation with the median and IQR:

\[z_i = \frac{X_i - \textrm{median}(X)}{\textrm{IQR}(X)}\]

Robust scaling keeps the outlying values large while not compressing the bulk of the values the way standardization or min-max scaling would.
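As a quick illustration of that point (a toy vector, not the county data), a single extreme value inflates the standard deviation and squashes everything else under standardization, while the median and IQR ignore it:

# Toy vector with one extreme outlier
x <- c(1, 2, 3, 4, 5, 100)

# Standardization: the outlier inflates the SD, compressing the other values
(x - mean(x)) / sd(x)

# Robust scaling: the median and IQR are unaffected by the outlier,
# so the bulk of the values keeps its spread
(x - median(x)) / IQR(x)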

For now, let’s create a correlation plot for our percentage variables. We don’t need to rescale the data first since correlation is unaffected by linear transformations (like robust rescaling).

counties |> 
  dplyr::select(
    no_HS =no_hs, hs = high_school, some_col:poverty, 
    minor_pov = minor_poverty, child_pov = child_poverty,
    unemploy = unemployment, mhhi_per
  ) |> 
  GGally::ggcorr(
    low = "tomato",
    mid = "white",
    high = "steelblue",
    label = T,
    label_round = 2,
    hjust = 0.65
  ) +
  theme_void()
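The plot rounds its labels to two decimal places; if we want the exact pairwise correlations among the three poverty measures, we can pull them directly (a quick check on the unscaled data):

counties |> 
  dplyr::select(poverty, minor_poverty, child_poverty) |> 
  cor() |> 
  round(digits = 2)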

There is a large amount of correlation in the data, which isn’t surprising since several variables measure similar characteristics, like the set of education features and the set of poverty variables. Specifically, the 3 poverty variables have very, very high pairwise correlations from 0.93 to 0.99 (!). We’ll likely need to remove some of the redundant, highly correlated variables. Before we do, let’s use robust scaling to place the variables on a similar scale.

counties_res <- 
  counties |> 
  # Using robust scaling with mutate and across
  mutate(
    across(
      .cols = c(no_hs:child_poverty, unemployment:mhhi_per),
      .fns  = ~ (. - median(.)) / IQR(.)
    )
  ) |> 
  # Dropping median household income, using relative mhhi instead
  dplyr::select(-medhhinc)

# Checking that the data have been scaled properly: Med = 0 and IQR = 1
counties_res |> 
  pivot_longer(
    cols = c(no_hs:mhhi_per),
    names_to = "features",
    values_to = "percent"
  ) |> 
  # Five number summary for each feature
  summarize(
    .by = features,
    min = min(percent),
    Q1 = quantile(percent, probs = 0.25),
    median = median(percent),
    Q3 = quantile(percent, probs = 0.75),
    max = max(percent),
    IQR = IQR(percent)
  ) 
## # A tibble: 9 × 7
##   features        min     Q1 median     Q3   max   IQR
##   <chr>         <dbl>  <dbl>  <dbl>  <dbl> <dbl> <dbl>
## 1 no_hs         -1.40 -0.412      0  0.588  7.74     1
## 2 high_school   -2.64 -0.501      0  0.499  2.09     1
## 3 some_col      -3.97 -0.517      0  0.483  7.21     1
## 4 bach          -1.84 -0.390      0  0.610  5.10     1
## 5 poverty       -1.53 -0.429      0  0.571  4.33     1
## 6 minor_poverty -1.48 -0.482      0  0.518  3.58     1
## 7 child_poverty -1.45 -0.472      0  0.528  4.08     1
## 8 unemployment  -1.87 -0.467      0  0.533  7.53     1
## 9 mhhi_per      -2.12 -0.463      0  0.537  6.99     1

We can check the boxplots to see if the features are now all on a similar scale.

(boxplot_counties + 
   scale_x_continuous() +
   
   facet_wrap(
     facets = vars(as_factor(feature)),
     #scales = "free"
   )) %+% 
  # Replacing the data set in the original graph
  (
    counties_res |> 
      rename(
        `No High School` = no_hs,
        `High School Only` = high_school,
        `Some College` = some_col,
        `Bachelor or Higher` = bach,
        Poverty = poverty,
        `Minor Poverty` = minor_poverty,
        `Child Poverty` = child_poverty,
        Unemployment = unemployment,
        `Median Household Income (Relative)` = mhhi_per
      ) |> 
      pivot_longer(
        cols = c(-rucc, -fips),
        names_to = "feature",
        values_to = "percentage"
      ) |> 
      # Need to multiply percentage by 100 to undo the divide by 100 in ggplot()
      mutate(percentage = percentage * 100)
  )

As mentioned previously, linear transformations (like robust scaling) don't affect the correlations, which we can verify by creating the correlation plot again.

counties_res |> 
  dplyr::select(
    no_HS =no_hs, hs = high_school, some_col:poverty, 
    minor_pov = minor_poverty, child_pov = child_poverty,
    unemploy = unemployment, mhhi_per
  ) |> 
  GGally::ggcorr(
    low = "tomato",
    mid = "white",
    high = "steelblue",
    label = T,
    label_round = 2,
    hjust = 0.65
  ) +
  theme_void()

Same graph as earlier!

Multicollinearity check

Multicollinearity in a data set occurs when the explanatory variables are either perfectly or strongly predictive of one another. While this may sound like a positive characteristic, we only want variables to be predictive of a response variable, not predictive of each other.

Principal Component Analysis

One tool to check for multicollinearity is principal component analysis (PCA), which "rotates" the data along its principal axes, creating a set of uncorrelated variables. The main idea is that if the original features are strongly related, the last few principal components (PCs) will be very small, since the earlier PCs will have already captured nearly all of the information.

We can use PCA to check for multicollinearity by examining the last few PCs. If they make up 0% (or approximately 0%) of the variation in the data, there are exact (or near-exact) linear relationships among the features. We can then use the PC loadings to find which sets of features are strongly related and how they are related. The big advantage of PCA over a correlation plot is that it can find linear associations among any number of features, not just two at a time!

counties_pca <- 
  prcomp(x = counties_res |> dplyr::select(-fips, -rucc))

# Creating a scree plot
factoextra::fviz_screeplot(
  counties_pca,
  geom = "bar",
  addlabels = T,
  hjust = 0.5
) +
  scale_y_continuous(
    breaks = seq(0, 100, 10),
    labels = ~ paste0(.x, "%"),
    minor_breaks = NULL,
    expand = c(0, 0, 0.05, 0)
  )

The scree plot above shows the percentage of the overall variance (spread) of the data that each PC contributes. The last PC contributes essentially nothing and the second-to-last very little, indicating that there are an exact and a near-exact linear association among our 9 features. The third-to-last PC also accounts for a very small percentage of the variance, so we'll look at it as well.
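If we want the exact percentages behind the scree plot, they can be computed from the standard deviations stored in the prcomp object:

# Percent of the total variance explained by each PC
round(100 * counties_pca$sdev^2 / sum(counties_pca$sdev^2), digits = 2)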

To check which sets of variables are involved in these linear associations, we look at the columns of the rotation (loadings) matrix that correspond to the last three PCs. If a feature's loading is near 0, the feature is not part of the linear association. If the loading is not negligible, the feature is an important part of it.

To make it easier to read, we'll create a graph:

gg_pc_rot <- 
  counties_pca |> 
  pluck("rotation") |> 
  data.frame() |> 
  # Picking the last 3 PCs
  dplyr::select(PC7:PC9) |>
  # Changing the features from row names to a column
  rownames_to_column(var = "feature")  |> 
  pivot_longer(
    cols = -feature,
    names_to = "PC",
    values_to = "loading"
  ) |> 
  # Better names for the features for the graph
  mutate(
    feature = case_when(
      feature == "no_hs"         ~ "No High School",
      feature == "high_school"   ~ "High School Only",
      feature == "some_col"      ~ "Some College",
      feature == "bach"          ~ "Bachelor+",
      feature == "poverty"       ~ "Poverty",
      feature == "minor_poverty" ~ 'Minor Poverty',
      feature == "child_poverty" ~ 'Child Poverty',
      feature == "unemployment"  ~ 'Unemployment',
      feature == "mhhi_per"      ~ 'Median Home Income\n(Relavetive)'
    )
  ) |> 
  # Creating a tile plot
  ggplot(
    mapping = aes(
      x = PC,
      y = as_factor(feature),
      fill = abs(loading),
      label = round(loading, 3)
    )
  ) +
  geom_tile(
    color = "white"
  ) +
  # Add the loading value to the tile
  ggfittext::geom_fit_text(
    contrast = T
  ) +
  # Changing to a white (low) to blue (high) gradient
  scale_fill_gradient(
    low = "white", 
    high = "steelblue",
    limits = c(0, 1)
  ) +
  theme_minimal() +
  theme(
    legend.position = "none",
    plot.title = element_text(hjust = 0.5, face = "bold")
  ) +
  # Removing the buffer space
  coord_cartesian(expand = F) +
  # Changing the labels
  labs(
    title = "Displaying Which Features are Linearly Associated",
    x = NULL,
    y = NULL
  )

gg_pc_rot

The last PC (PC9) is a combination of the four education variables, indicating that at least one is redundant. Since these four features are percentages of a set of complete and disjoint categories (every person falls into one and only one category), the percentages have to add up to 100%. Therefore, if we know three of the education variables, we also know the fourth, meaning we can remove one from our data.
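A quick sanity check of that claim on the unscaled counties data: the four education percentages should sum to (roughly) 100 for every county, up to rounding in the source data.

counties |> 
  mutate(edu_total = no_hs + high_school + some_col + bach) |> 
  summarize(min = min(edu_total), max = max(edu_total))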

PCs 7 and 8 involve the 3 poverty percentages, which we saw were strongly associated in the correlation plot. We can drop one or two of these features as well.

We’ll start by removing bachelor degree and child poverty, then redoing PCA to see if another poverty variable should be removed. We’re removing bachelor degree because it has a high correlation with high school only and child poverty because it’s the least reported of the three poverty percentages.

counties_pca2 <- 
  counties_res |> 
  dplyr::select(-rucc, -bach, -child_poverty, -fips) |> 
  prcomp()

# Scree plot
factoextra::fviz_screeplot(
  counties_pca2,
  geom = "bar",
  addlabels = T,
  hjust = 0.5
) +
  scale_y_continuous(
    breaks = seq(0, 100, 10),
    labels = ~ paste0(.x, "%"),
    minor_breaks = NULL,
    expand = c(0, 0, 0.05, 0)
  )

The last PC accounts for less than 1% of the overall variance in the data, so we’ll remove another feature. While our previous results indicate that it is likely one of the poverty variables, we’ll create a similar tile plot to confirm.

(gg_pc_rot) %+%
  # Adding just the loadings of the 7th PC
  (  
    counties_pca2 |> 
      pluck("rotation") |> 
      data.frame() |> 
      # Picking the last PC
      dplyr::select(PC7) |>
      # Changing the features from row names to a column
      rownames_to_column(var = "feature")  |> 
      pivot_longer(
        cols = -feature,
        names_to = "PC",
        values_to = "loading"
      ) |> 
      # Better names for the features for the graph
      mutate(
        feature = case_when(
          feature == "no_hs"         ~ "No High School",
          feature == "high_school"   ~ "High School Only",
          feature == "some_col"      ~ "Some College",
          feature == "bach"          ~ "Bachelor+",
          feature == "poverty"       ~ "Poverty",
          feature == "minor_poverty" ~ 'Minor Poverty',
          feature == "child_poverty" ~ 'Child Poverty',
          feature == "unemployment"  ~ 'Unemployment',
          feature == "mhhi_per"      ~ 'Median Home Income\n(Relavetive)'
        )
      )
  )

As expected, there is still a strong linear association between minor poverty and poverty, so we'll drop minor poverty. We're dropping minor poverty rather than poverty because the overall poverty percentage is the more widely reported figure, so it is more easily found for future analyses.

Variance Inflation Factor

A second way we can check for linear associations is with the variance inflation factor (VIF). VIF fits a set of linear models, each treating one variable as the response and the remaining features as explanatory variables. From each model, the \(R^2\) value is calculated, and the VIF for variable \(i\) is

\[\textrm{VIF}_i = \frac{1}{1 - R^2_i}\]

We consider a feature to have a high VIF if it is above 5 and a very high VIF if it is above 10. Let's check the VIFs for our full set of features and for our two reduced sets of features.
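To connect the formula to the code, here is a sketch of one VIF computed by hand (the poverty feature is just an example); the regclass::VIF() calls below do this for every feature at once:

# R^2 from regressing poverty on all of the other features
r2_poverty <- summary(
  lm(poverty ~ . - fips - rucc, data = counties_res)
)$r.squared

# VIF for poverty
1 / (1 - r2_poverty)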

# All features
VIF_full <- 
  counties_res |> 
  # Since the edu vars are exactly dependent, we'll add a small value to bach
  # That way we can calculate a VIF for all the variables
  mutate(bach = bach + runif(n = nrow(counties_res), -0.01, 0.01)) |> 
  # Forming the linear model to predict rucc (response is not important)
  lm(formula = rucc ~ . - fips) |> 
  regclass::VIF() |> 
  data.frame() |> 
  rownames_to_column(var = "feature")

# No bach or child_poverty
VIF_partial <- 
  counties_res |> 
  lm(formula = rucc ~ . - bach - child_poverty - fips) |> 
  regclass::VIF() |> 
  data.frame() |> 
  rownames_to_column(var = "feature")

# No bach and only poverty
VIF_simple <- 
  counties_res |> 
  lm(formula = rucc ~ . - bach - child_poverty - minor_poverty - fips) |> 
  regclass::VIF() |> 
  data.frame() |> 
  rownames_to_column(var = "feature")

# Displaying the results
VIF_results <- 
  left_join(
    x = VIF_full,
    y = VIF_partial,
    by = "feature"
  ) |> 
  left_join(
    y = VIF_simple,
    by = "feature"
  ) |> 
  mutate(
    across(
      .cols = -feature,
      .fns = ~ round(., 1)
    ),
    feature = case_when(
      feature == "no_hs"         ~ "No High School",
      feature == "high_school"   ~ "High School Only",
      feature == "some_col"      ~ "Some College",
      feature == "bach"          ~ "Bachelor+",
      feature == "poverty"       ~ "Poverty",
      feature == "minor_poverty" ~ 'Minor Poverty',
      feature == "child_poverty" ~ 'Child Poverty',
      feature == "unemployment"  ~ 'Unemployment',
      feature == "mhhi_per"      ~ 'Rel Median Home Income'
    )
  )


colnames(VIF_results) <- c("Feature", "VIF1", "VIF2", "VIF3")

VIF_results |> 
  gt::gt() |> 
  gt::sub_missing(
    columns = 2:4,
    missing_text = ""
  ) |> 
  gt::cols_label(
    VIF1 = "Full Model",
    VIF2 = "Middle Model",
    VIF3 = "Simple Model"
  ) |> 
  gt::tab_spanner(
    label = "VIF",
    columns = -Feature
  )
Feature                        Full Model VIF   Middle Model VIF   Simple Model VIF
No High School                         7599.2                2.0                1.9
High School Only                      13218.7                1.4                1.3
Some College                           6605.5                1.4                1.4
Bachelor+                             23203.9
Poverty                                   9.0                9.0                2.6
Minor Poverty                            85.5                9.5
Child Poverty                            75.1
Unemployment                              1.3                1.3                1.3
Rel Median Household Income               2.2                2.2                2.2

The VIF results agree with our findings from PCA. For the full model, the four education variables all have VIFs in the thousands, with Bachelor+ over 23,000! The three poverty features also have high VIFs, especially minor and child poverty. For the middle set of features, there is still some concern about multicollinearity, since the VIFs for poverty and minor poverty are near 10. The simple set of six features (no high school, high school only, some college, poverty, unemployment, and relative median household income) has no VIF above 3, indicating no concern about linear associations!
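One housekeeping note before clustering: the code and output from here on use a data set called counties2 that isn't constructed anywhere above. A minimal sketch of how it could be built, assuming it is simply the robust-scaled data with the ID columns and the redundant features dropped (the object name and exact construction are assumptions):

counties2 <- 
  counties_res |> 
  # Keeping the six retained features; the "county, state" row names of
  # counties_res (a data.frame) should carry through
  dplyr::select(-fips, -rucc, -bach, -child_poverty, -minor_poverty)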

Clustering Counties

The most common clustering method is k-means. Its big advantage is that we only need to specify a single parameter, \(k\). But what is \(k\)? It's the number of clusters we want the algorithm to group the cases (counties) into. The big drawback of k-means is that we need to specify that number of clusters up front, even though it is often one of the main questions of interest in a clustering analysis.

The two other drawbacks of k-means are:

  1. The method uses centroids (a p-dimensional vector of the averages of the features), which are susceptible to outliers, and outliers are prevalent in our data.

  2. Each point (county) will be in one of the \(k\) clusters that are specified, so clusters can include outliers.

To combat the outliers in the data, we will use k-medoids instead, which uses medoids (actual data points that minimize the total dissimilarity within each cluster) rather than means as the cluster centers. The drawback of k-medoids is that it is much more computationally expensive than k-means, so each iteration of the algorithm runs significantly slower.

k-medoids also shares the drawback of needing to specify \(k\). So how do we determine a good choice for \(k\)?

Determining the number of clusters

Biplots

Often, determining the number of clusters is done visually. If we only have two features, we can create a scatterplot and look for obvious groups in the plot. If there are more than two features, we can create a biplot: a scatterplot of a pair of principal components. Often just the first and second principal components are used, but we can look at other pairs as well, depending on how much of the variance they preserve.

##                          PC1   PC2   PC3   PC4   PC5   PC6
## Standard deviation     1.355 0.826 0.715 0.635 0.475 0.414
## Proportion of Variance 0.479 0.178 0.133 0.105 0.059 0.045
## Cumulative Proportion  0.479 0.658 0.791 0.896 0.955 1.000

From the results of the PCA, we'll look at pairwise plots of the first four PCs.
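Neither the PCA call nor the pairwise plots are shown here; a sketch of how both could be produced (the counties2_pca name is an assumption):

counties2_pca <- prcomp(counties2)

# Variance summary shown above
summary(counties2_pca)

# Pairwise scatterplots of the first four PC scores
counties2_pca$x |> 
  data.frame() |> 
  dplyr::select(PC1:PC4) |> 
  GGally::ggpairs()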

From the pairwise scatterplots, there don't appear to be any obvious, natural clusters, but there do appear to be several outlier counties!

Elbow plot

Another way of visually assessing how many clusters exist in the data is with an elbow plot. An elbow plot displays the "within sum of squares" (WSS), the total spread within the clusters, on the y-axis and the number of clusters on the x-axis. It performs clustering for k = 1 up to k = k_max (chosen by us) and displays the WSS for each choice:

\[WSS = \sum_i\sum_j(y_{ij} - \bar{y}_i)^2\]

where \(i\) indicates the cluster and \(j\) indicates the \(j\)th observation in the \(i\)th cluster. It's something like a variance that controls for the cluster each point is in (without dividing by \(n - 1\)). A smaller WSS (less spread within the clusters) indicates better-defined clusters in the data.
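As a sketch of the formula above, the WSS for a given clustering can be computed directly (shown here for an illustrative 2-cluster pam() fit; fviz_nbclust() below computes its own WSS internally):

# Illustrative 2-cluster k-medoids fit
pam_k2 <- cluster::pam(x = counties2, k = 2)

# Sum, over clusters, of the squared deviations from each cluster's centroid
sum(sapply(
  split(counties2, pam_k2$clustering),
  function(cl) sum(scale(cl, center = TRUE, scale = FALSE)^2)
))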

The downside of using WSS to judge how many clusters there are is as \(k\) increases, WSS always decreases. So we can’t just use the choice of \(k\) that makes WSS the smallest. Instead, we look for an “elbow” in the plot: where increasing from \(k\) to \(k + 1\) no longer causes a noticeable drop in WSS.

factoextra::fviz_nbclust(
  x = counties2,
  # Specifying to use k-mediods (pam() in the cluster package)
  FUNcluster = cluster::pam,
  method = "wss"
)

From the elbow plot, there is a noticeable drop in WSS from \(k = 1\) to \(k = 2\), then only small drops for each subsequent increase in \(k\), indicating that two clusters is the best choice. But the choice of \(k\) from an elbow plot can be subjective. Do we have more objective methods?

Silhouette Plot

A silhouette plot is similar to an elbow plot in that it has the number of clusters on the x-axis and, on the y-axis, a metric that judges how well the data are clustered: the silhouette score. So what is the silhouette score? It's a little difficult to explain fully here, but it compares the cluster that point \(i\) was placed in to the "next best" cluster it could have been placed in. If the current cluster is a much better choice, case \(i\) gets a large silhouette score, close to one. If the current and next-best clusters are about equally appropriate, the score will be near zero. The value displayed on a silhouette plot is the average of every case's silhouette score.

The big advantage is that, unlike the WSS in an elbow plot, the silhouette score isn't monotonic in \(k\) (it doesn't always decrease) but can rise and fall as \(k\) changes, so we can simply pick the choice of \(k\) that maximizes the average silhouette score.
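For a single choice of \(k\), the average silhouette width is also available straight from a pam() fit (a quick sketch):

cluster::pam(x = counties2, k = 2)$silinfo$avg.width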

factoextra::fviz_nbclust(
  x = counties2,
  # Specifying to use k-medoids
  FUNcluster = cluster::pam,
  method = "silhouette"
)

The silhouette plot above agrees with our conclusion from the elbow plot that the best choice is two clusters. We’ll look at one final graph to help decide on the best choice of \(k\).

Gap Statistic

The gap statistic uses simulated data sets that are similar to the original data in dimension and spread but have no true clusters. The simulated (fake) data are clustered with the same method and the same choices of \(k\) as the real data, and the WSS is calculated for each \(k\). This is repeated \(B\) times (usually 100 or 500), and the average1 WSS of the fake data is calculated for each choice of \(k\). The gap is the difference between this no-cluster average WSS and the WSS of the real data. The real WSS "improves" relative to the fake WSS whenever increasing \(k\) uncovers a genuine cluster, whereas once all of the real clusters are accounted for, the real and fake WSS change by similar amounts.
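The code that produced the gap statistic plot below isn't shown; it could be generated along these lines (nboot = 100 is an assumption, and this is slow with pam() on roughly 3,000 counties):

set.seed(1234)
factoextra::fviz_nbclust(
  x = counties2,
  # Specifying to use k-medoids
  FUNcluster = cluster::pam,
  method = "gap_stat",
  nboot = 100
)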

Gap Stat Graph

The Gap Statistic recommends 5 clusters, which is quite a few more than the previous two methods. So which should we choose?

We can look at many different methods to help decide the best number of clusters, which we’ll do in the next section:

NbClust

The NbClust() function in the NbClust package evaluates the best choice of \(k\) using 30 different metrics and saves the results. The code chunk below runs it, then visualizes the results with a bar chart of how often each metric chooses a given value of \(k\).

k_result <- NbClust::NbClust(data = counties2, method = "kmeans")

## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a 
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot. 
## 

## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
##  
## ******************************************************************* 
## * Among all indices:                                                
## * 8 proposed 2 as the best number of clusters 
## * 6 proposed 3 as the best number of clusters 
## * 1 proposed 5 as the best number of clusters 
## * 1 proposed 9 as the best number of clusters 
## * 1 proposed 11 as the best number of clusters 
## * 3 proposed 13 as the best number of clusters 
## * 1 proposed 14 as the best number of clusters 
## * 2 proposed 15 as the best number of clusters 
## 
##                    ***** Conclusion *****                            
##  
## * According to the majority rule, the best number of clusters is  2 
##  
##  
## *******************************************************************
# Displaying the choices
k_result$Best.nc |>
  t() |>
  data.frame() |>
  count(Number_clusters) |>
  ggplot(
    mapping = aes(
      x = Number_clusters,
      y = n
    )
  ) +
  geom_col(
    fill = "steelblue",
    color = "black"
  ) +
  geom_text(
    mapping = aes(label = n),
    vjust = -0.5
  ) +
  theme_classic() +
  labs(x = "Best choice of k",
       y = "Number of metrics") +
  scale_y_continuous(
    expand = c(0, 0, 0.05, 0),
    breaks = 0:7*2
  ) +
  scale_x_continuous(breaks = 0:15)

Eight metrics chose \(k = 2\) and six chose \(k = 3\), so we'll start k-medoids with \(k = 2\) and afterwards see how \(k = 3\) clusters the data.

k-medoids with two clusters

set.seed(1234)
counties_km2 <- 
  cluster::pam(
    x = counties2,
    k = 2
  )

fviz_cluster(
  counties_km2,
  geom = "point",
  #ellipse = F
) + 
  theme_minimal() + 
  theme(
    plot.title = element_text(hjust = 0.5),
    legend.position = "none"
  ) +
  scale_fill_manual(
    values = c("1" = "tomato", "2" = "steelblue")
  ) +
  scale_color_manual(
    values = c("1" = "tomato", "2" = "steelblue")
  )

There doesn’t appear to be too much of an obvious difference in the plot above using the first two PCs, just dividing a large cluster in two. Six methods recommended three clusters, only two fewer methods than what suggested two clusters? What does our results look like if \(k = 3\)?

set.seed(1234)
counties_km3 <- 
  cluster::pam(
    x = counties2,
    k = 3
  )

fviz_cluster(
  counties_km3,
  geom = "point",
  #ellipse = F
) + 
  theme_minimal() + 
  theme(
    plot.title = element_text(hjust = 0.5),
    legend.position = "none"
  ) +
  scale_fill_manual(
    values = c("1" = "tomato", "2" = "steelblue", "3" = "orange2")
  ) +
  scale_color_manual(
    values = c("1" = "tomato", "2" = "steelblue", "3" = "orange2")
  )

Like with \(k = 2\), \(k = 3\) just divides the large cluster into three groups.

Let’s compare the medoids of the two cluster and three cluster results

counties[
  # Keeping the rows that correspond to the medoid of each cluster
  rownames(counties) %in%
    c(counties_km2$medoids |> 
        data.frame() |> 
        rownames()),
] |> 
  # Dropping three irrelevant columns
  dplyr::select(-fips, -rucc, -medhhinc) |> 
  round(digits = 1) |> 
  # Moving the counties to a column
  rownames_to_column(var = "County") |>
  # Creating a graphic table with a title and subtitle
  gt::gt() |> 
  gt::tab_header(
    title = gt::md("Medoid County Results"),
    subtitle = gt::md("Two clusters")
  )
Medoid County Results
Two clusters

County               no_hs  high_school  some_col  bach  poverty  minor_poverty  child_poverty  unemployment  mhhi_per
Clark County, IN       9.5         33.3      32.6  24.6     10.4           13.9           13.5           2.7     100.2
Cherokee County, SC   14.8         37.7      30.3  17.2     18.8           28.7           27.4           4.1      77.1

Clark County, IN and Cherokee County, SC are the medoid counties for the two clusters. One cluster (Clark) has higher college participation (some college and bachelor's), lower poverty and unemployment rates, and a higher relative median household income than the other cluster (Cherokee).

The two clusters appear to be split along education and economic lines: higher college attainment and income versus lower college attainment and income.
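We can also check how many counties fall into each cluster (a quick sketch using the pam() clustering vector):

table(counties_km2$clustering)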

Let’s compare the same for three clusters

counties[
  # Keeping the rows that correspond to the medoid of each cluster
  rownames(counties) %in%
    c(counties_km3$medoids |> 
        data.frame() |> 
        rownames()),
] |> 
  # Dropping three irrelevant columns
  dplyr::select(-fips, -rucc, -medhhinc) |> 
  round(digits = 1) |> 
  # Moving the counties to a column
  rownames_to_column(var = "County") |>
  # Creating a graphic table with a title and subtitle
  gt::gt() |> 
  gt::tab_header(
    title = gt::md("Medoid County Results"),
    subtitle = gt::md("Three clusters")
  )
Medoid County Results
Three clusters

County             no_hs  high_school  some_col  bach  poverty  minor_poverty  child_poverty  unemployment  mhhi_per
Searcy County, AR   18.0         39.1      27.9  14.9     20.9           32.0           30.0           4.5      67.4
Marion County, FL   10.8         35.1      32.1  22.0     13.6           20.7           19.2           3.4      86.5
Lee County, GA       8.1         27.2      32.2  32.6      9.8           14.8           13.3           2.9     115.1

The three-cluster result is similar: just low, medium, and high groups on the education and economic features.

Let’s visualize the counties clustering results

usmap::plot_usmap(
  regions = "counties",
  data = counties |> 
    add_column(cluster = factor(counties_km2$clustering)),
  values = "cluster",
  linewidth = 0.1
) +
  scale_fill_manual(
    values = c("tomato", "steelblue"),
    labels = c("High College & Income", "Low College & Income")
  ) +
  labs(
    title = "US Counties: Two Clusters"
  )  +
  theme(
    legend.position = "none",
    #legend.position = "top",
    plot.title = element_text(hjust = 0.5, size = 20, face = "bold")
  ) +
  scale_x_continuous(expand = c(0, 0, 0, 0)) +
  scale_y_continuous(expand = c(0, 0, 0, 0))

usmap::plot_usmap(
  regions = "counties",
  data = 
    # Adding the cluster result to the counties data set
    left_join(
      x = counties |> rownames_to_column(var = "state_county"),
      y = data.frame(
        state_county = names(counties_km3$clustering),
        cluster = factor(counties_km3$clustering)
      ),
      by = "state_county"
    ),
  values = "cluster",
  linewidth = 0.1
) +
  # Having the colors match the previous results and add orange2 for cluster 3
  scale_fill_manual(
    values = c("1" = "tomato", "2" = "steelblue", "3" = "orange2")
  ) +
  # Adding a title
  labs(
    title = "US Counties: Three Clusters"
  )  +
  # Removing the legend and centering the title
  theme(
    legend.position = "none",
    plot.title = element_text(hjust = 0.5, size = 20, face = "bold")
  ) +
  # Removing the buffer space
  scale_x_continuous(expand = c(0, 0, 0, 0)) +
  scale_y_continuous(expand = c(0, 0, 0, 0))

Neither map adds much insight into the clustering beyond what we have already seen.

Another drawback of k-means/medoids is that every county will be included in one of the specified clusters, even when it is very different from every other county. We'd like to use a method that won't cluster a county if it is an outlier. What clustering method allows for outliers?

Clustering with DBSCAN

DBSCAN (Density-Based Spatial Clustering of Applications with Noise) is a clustering algorithm that builds clusters out of dense regions of points: a cluster grows by repeatedly adding points that lie within a set distance of points already in the cluster, and any point that doesn't end up in a dense region is left unclustered as noise.

To use DBSCAN, we need to specify two parameters:

  1. minPts = minimum number of cases needed to start a new cluster

  2. epsilon = the radius within which each case will "scan" for new points to add to its cluster

minPts tends to be a subjective parameter, depending on the number of cases in the data, but the algorithm isn’t very sensitive to small changes. With over 3,000 counties and six features used in the data, we’ll choose minPts = 15. DBSCAN is much more sensitive to the choice of epsilon. So how do we make a good choice? We look at the kNN distance graph to find an elbow:

# Creating a data frame of the knn distance with k = 15
data.frame(
  knn_dist = dbscan::kNNdist(x = counties2, k = 15)
) |> 
  # Arranging each county from shortest distance the largest distance
  arrange(knn_dist) |> 
  # Adding the row number to indicate the order
  mutate(index = row_number()) |> 
  # Creating the line graph of kNN distance
  ggplot(
    mapping = aes(
      x = index,
      y = knn_dist
    )
  ) + 
  geom_line() +
  labs(
    x = NULL,
    y = "kNN Distance"
  ) +
  scale_y_continuous(
    breaks = seq(0, 6, 0.5)
  ) +
  theme_minimal()

The elbow of the graph appears to occur with a kNN distance around 1.25 to 1.5, so we’ll choose epsilon = 1.375

counties_db <- 
  dbscan::dbscan(
    x = counties2,
    eps = 1.375,
    minPts = 15
  )

counties_db
## DBSCAN clustering for 3141 objects.
## Parameters: eps = 1.375, minPts = 15
## Using euclidean distances and borderpoints = TRUE
## The clustering contains 1 cluster(s) and 49 noise points.
## 
##    0    1 
##   49 3092 
## 
## Available fields: cluster, eps, minPts, metric, borderPoints

DBSCAN forms one major cluster and leaves 49 counties unclustered (the outliers). This mirrors what we saw in the biplots earlier.
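To see exactly which counties were flagged, we can pull the row names where the cluster label is 0, DBSCAN's label for noise points (a quick sketch):

row.names(counties2)[counties_db$cluster == 0]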

usmap::plot_usmap(
  regions = "counties",
  data = 
    left_join(
      x = counties |> rownames_to_column(var = "state_county"),
      y = data.frame(
        state_county = row.names(counties2),
        cluster = factor(counties_db$cluster)
      ),
      by = "state_county"
    ),
  values = "cluster",
  linewidth = 0.1
) +
  scale_fill_manual(
    values = c("0" = "tomato", "1" = "steelblue"),
    labels = c("0" = "Outlier", "1" = "Not Outlier"),
    na.translate = F
  ) +

  labs(
    title = "US Counties: DBSCAN"
  )  +
  theme(
    legend.position = "right",
    plot.title = element_text(hjust = 0.5, size = 20, face = "bold")
  ) +
  scale_x_continuous(expand = c(0, 0, 0, 0)) +
  scale_y_continuous(expand = c(0, 0, 0, 0))

The only major take-away is that there are no outlier counties in the eastern states.

Results and Conclusions

Unfortunately, there doesn’t appear to be a strong indication that the counties of the US has natural clusters to them. Going into this analysis, I would have thought that there would be at least two clusters of rural counties with worse economic and lower education features and urban counties with higher economic and education features. While overall this is true for very rural vs very urban counties, there are enough counties in the middle that “bridges” the two sets so there isn’t enough of a separation to identify the two separate clusters.


  1. It’s not an average exactly, but an average of the log of the WSS. See this page for more details.↩︎