Cleaning the education data
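The code in this section assumes that the tidyverse and readxl packages were attached in an earlier setup chunk (not shown here); every other package is called with its namespace prefix. A minimal setup sketch:
# Assumed setup (sketch): packages whose functions are called without a prefix
library(tidyverse) # dplyr, tidyr, tibble, purrr, forcats, ggplot2
library(readxl)    # read_xlsx()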
education <-
read_xlsx("Education.xlsx", skip = 3) |>
# Making the names R friendly
janitor::clean_names() |>
# Connecticut is missing the newest (2018-22) values; fall back to the 2008-12 values
mutate(
# # rucc
# x2023_rural_urban_continuum_code = if_else(
# is.na(x2023_rural_urban_continuum_code),
# x2013_rural_urban_continuum_code,
# x2023_rural_urban_continuum_code
# ),
# No high school
percent_of_adults_with_less_than_a_high_school_diploma_2018_22 = if_else(
is.na(percent_of_adults_with_less_than_a_high_school_diploma_2018_22),
percent_of_adults_with_less_than_a_high_school_diploma_2008_12,
percent_of_adults_with_less_than_a_high_school_diploma_2018_22
),
# high school
percent_of_adults_with_a_high_school_diploma_only_2018_22 = if_else(
is.na(percent_of_adults_with_a_high_school_diploma_only_2018_22),
percent_of_adults_with_a_high_school_diploma_only_2008_12,
percent_of_adults_with_a_high_school_diploma_only_2018_22
),
# some college
percent_of_adults_completing_some_college_or_associates_degree_2018_22 = if_else(
is.na(percent_of_adults_completing_some_college_or_associates_degree_2018_22),
percent_of_adults_completing_some_college_or_associates_degree_2008_12,
percent_of_adults_completing_some_college_or_associates_degree_2018_22
),
# bachelor
percent_of_adults_with_a_bachelors_degree_or_higher_2018_22 = if_else(
is.na(percent_of_adults_with_a_bachelors_degree_or_higher_2018_22),
percent_of_adults_with_a_bachelors_degree_or_higher_2008_12,
percent_of_adults_with_a_bachelors_degree_or_higher_2018_22
)
) |>
# Keeping specific columns and renaming some
dplyr::select(fips = fips_code,
state,
area = area_name,
contains("2023"),
ends_with("22")) |>
# Removing the education counts columns (keep percentage)
dplyr::select(
fips:area,
x2023_rural_urban_continuum_code,
starts_with("percent")
) |>
# Removing states (fips are multiples of 1000)
filter(as.numeric(fips) %% 1000 != 0)
# Changing the column names
colnames(education) <- c("fips", "state", "county", "rucc",
"no_hs", "high_school", "some_col", "bach")
Loading and initial cleaning of the poverty data
poverty <-
read_xlsx("PovertyEstimates.xlsx", skip = 4) |>
# Making the names R friendly
janitor::clean_names() |>
# Keeping ID columns, poverty percent columns, and median HH income
dplyr::select(fips = fips_code,
state = stabr,
area = area_name,
contains("pct"),
medhhinc = medhhinc_2021) |>
# Removing states (fips are multiples of 1000)
filter(as.numeric(fips) %% 1000 != 0)
# Changing the column names
colnames(poverty) <- c("fips", "state", "county", "poverty", "minor_poverty",
"child_poverty", "infant_poverty", "medhhinc")
Loading and initial cleaning of the unemployment data
unemploy <-
read_xlsx("Unemployment.xlsx", skip = 4) |>
# Making the names R friendly
janitor::clean_names() |>
# Keeping specific columns and renaming some
dplyr::select(fips = fips_code,
state,
county = area_name,
ends_with("2021"),
ends_with("2022")) |>
# Keeping only the 2022 unemployment rate and the 2021 median HH income relative to the state
dplyr::select(
fips:county,
unemployment = unemployment_rate_2022,
mhhi_per = med_hh_income_percent_of_state_total_2021
) |>
# Removing states (fips are multiples of 1000)
filter(as.numeric(fips) %% 1000 != 0)
Merging the three data sets into a single data set named counties, joining by fips and state
counties <-
# Merging education and poverty
inner_join(
x = education,
y = poverty |> dplyr::select(-county),
by = c("fips", "state")
) |>
inner_join(
y = unemploy |> dplyr::select(-county),
# unemploy stores its county column as "county, state", so we drop it and join by fips and state
by = c("fips", "state")
) |>
# Removing Washington DC
filter(state != "DC") |>
# Creating row names: county, state
mutate(county_state = paste0(county, ", ", state)) |>
column_to_rownames(var = "county_state") |>
dplyr::select(-county, -state)
skimr::skim(counties)
Name | counties |
Number of rows | 3141 |
Number of columns | 13 |
Column type frequency: character | 1 |
Column type frequency: numeric | 12 |
Group variables | None |
Variable type: character
skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|
fips | 0 | 1 | 5 | 5 | 0 | 3141 | 0 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
rucc | 8 | 1 | 5.26 | 2.93 | 1.00 | 2.00 | 6.00 | 8.00 | 9.00 | ▆▅▁▅▇ |
no_hs | 0 | 1 | 11.65 | 5.72 | 0.31 | 7.47 | 10.46 | 14.72 | 66.67 | ▇▃▁▁▁ |
high_school | 0 | 1 | 33.95 | 7.55 | 7.13 | 29.11 | 34.25 | 39.37 | 55.68 | ▁▂▇▆▁ |
some_col | 0 | 1 | 30.94 | 5.34 | 3.45 | 27.38 | 30.96 | 34.31 | 80.95 | ▁▇▂▁▁ |
bach | 0 | 1 | 23.46 | 10.01 | 0.00 | 16.51 | 20.94 | 27.86 | 78.87 | ▃▇▂▁▁ |
poverty | 0 | 1 | 14.61 | 5.66 | 2.90 | 10.60 | 13.60 | 17.60 | 43.90 | ▅▇▂▁▁ |
minor_poverty | 0 | 1 | 20.05 | 8.41 | 2.80 | 13.80 | 19.10 | 24.80 | 58.50 | ▅▇▃▁▁ |
child_poverty | 0 | 1 | 18.90 | 8.14 | 2.40 | 12.80 | 17.80 | 23.40 | 61.10 | ▅▇▂▁▁ |
infant_poverty | 3141 | 0 | NaN | NA | NA | NA | NA | NA | NA | |
medhhinc | 0 | 1 | 58931.40 | 15259.34 | 25653.00 | 49014.00 | 56632.00 | 65682.00 | 153716.00 | ▅▇▁▁▁ |
unemployment | 0 | 1 | 3.59 | 1.23 | 0.60 | 2.70 | 3.40 | 4.20 | 14.70 | ▇▇▁▁▁ |
mhhi_per | 0 | 1 | 89.36 | 19.75 | 38.40 | 76.30 | 86.90 | 99.20 | 246.90 | ▅▇▁▁▁ |
An initial takeaway from forming the counties data is that the infant poverty rate (0-4 years old) is a state-level variable, not a county-level feature, so we’ll remove it during the data preparation step.
# Let's drop the infant poverty column
counties <-
counties |>
dplyr::select(-infant_poverty)
skimr::skim(counties)
Name | counties |
Number of rows | 3141 |
Number of columns | 12 |
Column type frequency: character | 1 |
Column type frequency: numeric | 11 |
Group variables | None |
Variable type: character
skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|
fips | 0 | 1 | 5 | 5 | 0 | 3141 | 0 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
rucc | 8 | 1 | 5.26 | 2.93 | 1.00 | 2.00 | 6.00 | 8.00 | 9.00 | ▆▅▁▅▇ |
no_hs | 0 | 1 | 11.65 | 5.72 | 0.31 | 7.47 | 10.46 | 14.72 | 66.67 | ▇▃▁▁▁ |
high_school | 0 | 1 | 33.95 | 7.55 | 7.13 | 29.11 | 34.25 | 39.37 | 55.68 | ▁▂▇▆▁ |
some_col | 0 | 1 | 30.94 | 5.34 | 3.45 | 27.38 | 30.96 | 34.31 | 80.95 | ▁▇▂▁▁ |
bach | 0 | 1 | 23.46 | 10.01 | 0.00 | 16.51 | 20.94 | 27.86 | 78.87 | ▃▇▂▁▁ |
poverty | 0 | 1 | 14.61 | 5.66 | 2.90 | 10.60 | 13.60 | 17.60 | 43.90 | ▅▇▂▁▁ |
minor_poverty | 0 | 1 | 20.05 | 8.41 | 2.80 | 13.80 | 19.10 | 24.80 | 58.50 | ▅▇▃▁▁ |
child_poverty | 0 | 1 | 18.90 | 8.14 | 2.40 | 12.80 | 17.80 | 23.40 | 61.10 | ▅▇▂▁▁ |
medhhinc | 0 | 1 | 58931.40 | 15259.34 | 25653.00 | 49014.00 | 56632.00 | 65682.00 | 153716.00 | ▅▇▁▁▁ |
unemployment | 0 | 1 | 3.59 | 1.23 | 0.60 | 2.70 | 3.40 | 4.20 | 14.70 | ▇▇▁▁▁ |
mhhi_per | 0 | 1 | 89.36 | 19.75 | 38.40 | 76.30 | 86.90 | 99.20 | 246.90 | ▅▇▁▁▁ |
The counties data set has 3141 rows (1 row = 1 county), while the three source data sets each have more rows, indicating that not all rows were retained during the joins. Let’s check which counties are “left behind” by our inner joins.
education |>
# Looking at rows in education but not in counties
filter(!(fips %in% counties$fips)) |>
count(state) |>
arrange(-n) |>
rename(`drop rows` = n) |>
gt::gt()
state | drop rows |
---|---|
PR | 78 |
CT | 9 |
AK | 8 |
VA | 2 |
DC | 1 |
HI | 1 |
MT | 1 |
The dropped rows in education are mostly from Puerto Rico (PR), Connecticut (CT), and Alaska (AK). Puerto Rico shouldn’t be included since it is not a state, but what about the dropped rows from the 50 states?
education |>
# Looking at rows in education but not in counties
filter(
!(fips %in% counties$fips),
state %in% c("AK", "CT")
)
## # A tibble: 17 × 8
## fips state county rucc no_hs high_school some_col bach
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 02010 AK Aleutian Islands NA NA NA NA NA
## 2 02160 AK Kuskokwim Division NA NA NA NA NA
## 3 02201 AK Prince of Wales-Outer Ket… NA NA NA NA NA
## 4 02231 AK Skagway-Yakutat-Angoon Ce… NA NA NA NA NA
## 5 02232 AK Skagway-Hoonah-Angoon Cen… NA NA NA NA NA
## 6 02250 AK Upper Yukon Division NA NA NA NA NA
## 7 02261 AK Valdez-Cordova Census Area NA 10.0 23.6 42.7 23.6
## 8 02280 AK Wrangell-Petersburg Censu… NA NA NA NA NA
## 9 09110 CT Capitol Planning Region, … 1 8.99 25.2 24.7 41.1
## 10 09120 CT Greater Bridgeport Planni… 2 11.7 25.2 21.4 41.7
## 11 09130 CT Lower Connecticut River V… 1 4.73 23.8 26.2 45.3
## 12 09140 CT Naugatuck Valley Planning… 2 9.65 30.1 27.0 33.3
## 13 09150 CT Northeastern Connecticut … 4 8.72 34.8 31.4 25.0
## 14 09160 CT Northwest Hills Planning … 4 6.81 28.6 26.7 37.9
## 15 09170 CT South Central Connecticut… 2 8.57 28.4 22.6 40.4
## 16 09180 CT Southeastern Connecticut … 2 7.61 29.6 29.0 33.8
## 17 09190 CT Western Connecticut Plann… 2 8.36 18.5 18.9 54.2
For Alaska and Connecticut, the dropped rows correspond not to counties but to census areas (AK) and planning regions (CT). Since they don’t represent counties, dropping them is correct. What about Virginia (VA), Hawaii (HI), and Montana (MT)?
education |>
# Looking at rows in education but not in counties
filter(
!(fips %in% counties$fips),
state %in% c("VA", "HI", "MT")
)
## # A tibble: 4 × 8
## fips state county rucc no_hs high_school some_col bach
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 15005 HI Kalawao County 3 8 34 24 34
## 2 30113 MT Yellowstone National Park NA NA NA NA NA
## 3 51515 VA Bedford city NA 15.2 33.1 30.4 21.3
## 4 51560 VA Clifton Forge city NA NA NA NA NA
The two Virginia cities were formerly independent cities that were not considered part of any county, but they have since been incorporated into counties, so we’ll keep excluding them. Yellowstone National Park is also not a county, but a national park (as the name suggests). Kalawao County, Hawaii is the smallest county in the country by land area, has a population of 82 (as of 2020), doesn’t have an elected government, and is a judicial district of Maui County.
All of the excluded counties in the education data set are appropriately left out of the full data set. But what about the two rows excluded from the poverty data set?
poverty |>
# Looking at rows in poverty but not in counties
filter(!(fips %in% counties$fips))
## # A tibble: 2 × 8
## fips state county poverty minor_poverty child_poverty infant_poverty medhhinc
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 11001 DC Distr… 16.8 25.5 25.5 NA 91072
## 2 15005 HI Kalaw… NA NA NA NA NA
One is the District of Columbia (not a county), and the other is the Hawaiian county we saw in the education data!
unemploy |>
# Looking at rows in unemploy but not in counties
filter(!(fips %in% counties$fips)) |>
count(state)
## # A tibble: 3 × 2
## state n
## <chr> <int>
## 1 AK 4
## 2 DC 1
## 3 PR 78
It appears that we have the same non-counties as the education data (DC and PR), and likely the same issue with Alaska, but let’s check Alaska again since only 4 rows are excluded here instead of 8, just in case there is something new.
unemploy |>
# Looking at rows in unemploy but not in counties
filter(
!(fips %in% counties$fips),
state == "AK"
)
## # A tibble: 4 × 5
## fips state county unemployment mhhi_per
## <chr> <chr> <chr> <dbl> <dbl>
## 1 02201 AK Prince of Wales-Outer Ketchikan Census Area… NA NA
## 2 02232 AK Skagway-Hoonah-Angoon Census Area, AK NA NA
## 3 02261 AK Valdez-Cordova Census Area, AK NA NA
## 4 02280 AK Wrangell-Petersburg Census Area, AK NA NA
Nothing new: these are the same census areas we saw in the education data set, just fewer of them.
Let’s create a set of box plots for each of the percentage columns to get an idea of the range of each feature:
boxplot_counties <-
counties |>
rename(
`No High School` = no_hs,
`High School Only` = high_school,
`Some College` = some_col,
`Bachelor or Higher` = bach,
Poverty = poverty,
`Minor Poverty` = minor_poverty,
`Child Poverty` = child_poverty,
Unemployment = unemployment,
`Median Household Income (Relative)` = mhhi_per
) |>
pivot_longer(
cols = c(-fips, -rucc, -medhhinc),
names_to = "feature",
values_to = "percentage"
) |>
ggplot(
mapping = aes(
x = percentage/100
)
) +
geom_boxplot(
fill = "steelblue"
) +
labs(
x = NULL,
title = "US County Features"
) +
theme_bw() +
theme(
axis.text.y = element_blank(),
axis.ticks.y = element_blank(),
plot.title = element_text(hjust = 0.5),
panel.grid.major.y = element_blank(),
panel.grid.minor.y = element_blank()
)
boxplot_counties +
# Changing x-axis to a percentage
scale_x_continuous(labels = scales::label_percent()) +
# a plot per feature
facet_wrap(
facets = vars(as_factor(feature)),
#scales = "free"
)
Since many features have large outliers, we need a method of rescaling the data that is not sensitive to the large outliers. We’ll use robust scaling, which is similar to standardization, but replaces the mean and standard deviation with the median and IQR:
\[z_i = \frac{X_i - \textrm{median}(X)}{\textrm{IQR}(X)}\]
Robust scaling preserves the large values while not compressing the remaining values the way standardization or min-max scaling would.
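To see what that means in practice, here is a small illustration (a toy vector, not the county data) comparing robust scaling with standardization when one large outlier is present:
# A toy vector with one large outlier
x <- c(1, 2, 3, 4, 100)
# Robust scaling: the bulk of the values keep their spread (median 0, IQR 1)
(x - median(x)) / IQR(x)
# Standardization: the outlier inflates the SD and compresses the other values
(x - mean(x)) / sd(x)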
For now, let’s create a correlation plot for our percentage variables. We don’t need to rescale the data first since correlation is unaffected by linear transformations (like robust rescaling).
counties |>
dplyr::select(
no_HS = no_hs, hs = high_school, some_col:poverty,
minor_pov = minor_poverty, child_pov = child_poverty,
unemploy = unemployment, mhhi_per
) |>
GGally::ggcorr(
low = "tomato",
mid = "white",
high = "steelblue",
label = T,
label_round = 2,
hjust = 0.65
) +
theme_void()
There is a large amount of correlation in the data, which isn’t surprising since several variables measure similar characteristics, like the set of education features and the set of poverty variables. In particular, the three poverty variables have extremely high pairwise correlations, ranging from 0.93 to 0.99 (!). We’ll likely need to remove some of the redundant, highly correlated variables. Before we do, let’s use robust scaling to place the variables on a similar scale.
counties_res <-
counties |>
# Using robust scaling with mutate and across
mutate(
across(
.cols = c(no_hs:child_poverty, unemployment:mhhi_per),
.fns = ~ (. - median(.)) / IQR(.)
)
) |>
# Dropping median household income, using relative mhhi instead
dplyr::select(-medhhinc)
# Checking that the data have been scaled properly: Med = 0 and IQR = 1
counties_res |>
pivot_longer(
cols = c(no_hs:mhhi_per),
names_to = "features",
values_to = "percent"
) |>
# Five number summary for each feature
summarize(
.by = features,
min = min(percent),
Q1 = quantile(percent, probs = 0.25),
median = median(percent),
Q3 = quantile(percent, probs = 0.75),
max = max(percent),
IQR = IQR(percent)
)
## # A tibble: 9 × 7
## features min Q1 median Q3 max IQR
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 no_hs -1.40 -0.412 0 0.588 7.74 1
## 2 high_school -2.64 -0.501 0 0.499 2.09 1
## 3 some_col -3.97 -0.517 0 0.483 7.21 1
## 4 bach -1.84 -0.390 0 0.610 5.10 1
## 5 poverty -1.53 -0.429 0 0.571 4.33 1
## 6 minor_poverty -1.48 -0.482 0 0.518 3.58 1
## 7 child_poverty -1.45 -0.472 0 0.528 4.08 1
## 8 unemployment -1.87 -0.467 0 0.533 7.53 1
## 9 mhhi_per -2.12 -0.463 0 0.537 6.99 1
We can recheck the boxplots to see whether the features are now on a similar scale.
(boxplot_counties +
scale_x_continuous() +
facet_wrap(
facets = vars(as_factor(feature)),
#scales = "free"
)) %+%
# Replacing the data set in the original graph
(
counties_res |>
rename(
`No High School` = no_hs,
`High School Only` = high_school,
`Some College` = some_col,
`Bachelor or Higher` = bach,
Poverty = poverty,
`Minor Poverty` = minor_poverty,
`Child Poverty` = child_poverty,
Unemployment = unemployment,
`Median Household Income (Relative)` = mhhi_per
) |>
pivot_longer(
cols = c(-rucc, -fips),
names_to = "feature",
values_to = "percentage"
) |>
# Need to multiply percentage by 100 to undo the divide by 100 in ggplot()
mutate(percentage = percentage * 100)
)
As mentioned previously, linear transformations (like robust scaling) don’t affect the correlations, which we can verify by recreating the correlation plot.
counties_res |>
dplyr::select(
no_HS = no_hs, hs = high_school, some_col:poverty,
minor_pov = minor_poverty, child_pov = child_poverty,
unemploy = unemployment, mhhi_per
) |>
GGally::ggcorr(
low = "tomato",
mid = "white",
high = "steelblue",
label = T,
label_round = 2,
hjust = 0.65
) +
theme_void()
Same graph as earlier!
Multicollinearity in a data set occurs when the explanatory variables are either perfectly or strongly predictive of one another. While this may sound like a positive characteristic, we only want variables to be predictive of a response variable, not predictive of each other.
One tool to check for multicollinearity is principal component analysis (PCA), which “rotates” the data along its primary axis, creating a set of uncorrelated variables. The main idea is that by uncorrelating the data, the last few principal components (PCs) should be very small since they shouldn’t contain much information.
We can use PCA to check for multicollinearity by examining the last few PCs. If they make up 0% (or approximately 0%) of the variation in the data, that indicates there are exact or near-exact linear relationships. We can then use those PCs to find which sets of features are strongly related and how they are related. The big advantage of PCA over a correlation plot is that it can find linear associations among any number of features, not just two at a time!
counties_pca <-
prcomp(x = counties_res |> dplyr::select(-fips, -rucc))
# Creating a scree plot
factoextra::fviz_screeplot(
counties_pca,
geom = "bar",
addlabels = T,
hjust = 0.5
) +
scale_y_continuous(
breaks = seq(0, 100, 10),
labels = ~ paste0(.x, "%"),
minor_breaks = NULL,
expand = c(0, 0, 0.05, 0)
)
The scree plot above shows the percentage each PC contributes to the overall variance (size) of the data. The second-to-last and last PCs contribute very little and essentially none, respectively, to the variance of the original data, indicating that there are a near-exact and an exact linear association among our nine features. The third-to-last PC also accounts for a very small percentage of the variance, so we’ll look at it as well.
To check which sets of variables are involved in a linear association, we look at the columns of the rotation matrix that correspond to the last three PCs. If a feature's loading is near zero, the feature is not part of the linear association; if it is not negligible, the feature is an important part of it.
To make this easier to read, we'll create a graph.
gg_pc_rot <-
counties_pca |>
pluck("rotation") |>
data.frame() |>
# Picking the last 3 PCs
dplyr::select(PC7:PC9) |>
# Changing the features from row names to a column
rownames_to_column(var = "feature") |>
pivot_longer(
cols = -feature,
names_to = "PC",
values_to = "loading"
) |>
# Better names for the features for the graph
mutate(
feature = case_when(
feature == "no_hs" ~ "No High School",
feature == "high_school" ~ "High School Only",
feature == "some_col" ~ "Some College",
feature == "bach" ~ "Bachelor+",
feature == "poverty" ~ "Poverty",
feature == "minor_poverty" ~ 'Minor Poverty',
feature == "child_poverty" ~ 'Child Poverty',
feature == "unemployment" ~ 'Unemployment',
feature == "mhhi_per" ~ 'Median Home Income\n(Relavetive)'
)
) |>
# Creating a tile plot
ggplot(
mapping = aes(
x = PC,
y = as_factor(feature),
fill = abs(loading),
label = round(loading, 3)
)
) +
geom_tile(
color = "white"
) +
# Add the loading value to the tile
ggfittext::geom_fit_text(
contrast = T
) +
# Changing to a white (low) to blue (high) gradient
scale_fill_gradient(
low = "white",
high = "steelblue",
limits = c(0, 1)
) +
theme_minimal() +
theme(
legend.position = "none",
plot.title = element_text(hjust = 0.5, face = "bold")
) +
# Removing the buffer space
coord_cartesian(expand = F) +
# Changing the labels
labs(
title = "Displaying Which Features are Linearly Associated",
x = NULL,
y = NULL
)
gg_pc_rot
The last PC (PC9) is a combination of the four education variables, indicating that at least one is redundant. Since these four features are percentages of a set of complete and disjoint categories (every person falls into one and only one category), the percentages have to add up to 100%. Therefore, if we know three of the education variables, we also know the fourth, meaning we can remove one from our data.
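As a quick sanity check of that claim (a sketch, not part of the original analysis), the four education percentages should sum to roughly 100% in every county:
# The four education percentages should add to ~100% for each county
counties |>
  mutate(edu_total = no_hs + high_school + some_col + bach) |>
  summarize(
    min_total = min(edu_total),
    max_total = max(edu_total)
  )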
PCs 7 and 8 involve the three poverty percentages, which we saw were strongly associated in the correlation plot. We can drop one or two of these features as well.
We’ll start by removing the bachelor's degree and child poverty features, then redo PCA to see if another poverty variable should be removed. We’re removing bachelor's degree because it has a high correlation with high school only, and child poverty because it’s the least reported of the three poverty percentages.
counties_pca2 <-
counties_res |>
dplyr::select(-rucc, -bach, -child_poverty, -fips) |>
prcomp()
# Scree plot
factoextra::fviz_screeplot(
counties_pca2,
geom = "bar",
addlabels = T,
hjust = 0.5
) +
scale_y_continuous(
breaks = seq(0, 100, 10),
labels = ~ paste0(.x, "%"),
minor_breaks = NULL,
expand = c(0, 0, 0.05, 0)
)
The last PC accounts for less than 1% of the overall variance in the data, so we’ll remove another feature. While our previous results indicate that it is likely one of the poverty variables, we’ll create a similar tile plot to confirm.
(gg_pc_rot) %+%
# Adding just the loadings of the 7th PC
(
counties_pca2 |>
pluck("rotation") |>
data.frame() |>
# Picking just the last PC (PC7)
dplyr::select(PC7) |>
# Changing the features from row names to a column
rownames_to_column(var = "feature") |>
pivot_longer(
cols = -feature,
names_to = "PC",
values_to = "loading"
) |>
# Better names for the features for the graph
mutate(
feature = case_when(
feature == "no_hs" ~ "No High School",
feature == "high_school" ~ "High School Only",
feature == "some_col" ~ "Some College",
feature == "bach" ~ "Bachelor+",
feature == "poverty" ~ "Poverty",
feature == "minor_poverty" ~ 'Minor Poverty',
feature == "child_poverty" ~ 'Child Poverty',
feature == "unemployment" ~ 'Unemployment',
feature == "mhhi_per" ~ 'Median Home Income\n(Relavetive)'
)
)
)
As expected, there is still a strong linear association between minor poverty and poverty, so we’ll drop minor poverty. We’re dropping minor poverty rather than poverty because the overall poverty percentage is more widely reported, so it is more easily found for future analysis.
A second way we can check for linear associations is with the variance inflation factor (VIF). VIF works by fitting a set of linear models, treating each variable in turn as the response and the remaining features as the explanatory variables. From each model, the \(R^2\) value is calculated, and the VIF for variable \(i\) is
\[\textrm{VIF}_i = \frac{1}{1 - R^2_i}\]
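As an illustration of the formula (not part of the original analysis), the VIF for a single feature can be computed by hand from the \(R^2\) of a model predicting it from the other features; for poverty, the result should be close to the value reported in the table below:
# Illustration: VIF for poverty computed directly from R^2
r2_poverty <-
  lm(
    poverty ~ no_hs + high_school + some_col + bach + minor_poverty +
      child_poverty + unemployment + mhhi_per,
    data = counties_res
  ) |>
  summary() |>
  pluck("r.squared")
# VIF_poverty = 1 / (1 - R^2)
1 / (1 - r2_poverty)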
We consider a feature to have a high VIF if it is above 5 and a very high VIF if it is above 10. Let’s check the VIFs for our full set of features and our two reduced sets of features.
# All features
VIF_full <-
counties_res |>
# Since the edu vars are exactly dependent, we'll add a small value to bach
# That way we can calculate a VIF for all the variables
mutate(bach = bach + runif(n = nrow(counties_res), -0.01, 0.01)) |>
# Forming the linear model to predict rucc (response is not important)
lm(formula = rucc ~ . - fips) |>
regclass::VIF() |>
data.frame() |>
rownames_to_column(var = "feature")
# No bach or child_poverty
VIF_partial <-
counties_res |>
lm(formula = rucc ~ . - bach - child_poverty - fips) |>
regclass::VIF() |>
data.frame() |>
rownames_to_column(var = "feature")
# No bach and only poverty
VIF_simple <-
counties_res |>
lm(formula = rucc ~ . - bach - child_poverty - minor_poverty - fips) |>
regclass::VIF() |>
data.frame() |>
rownames_to_column(var = "feature")
# Displaying the results
VIF_results <-
left_join(
x = VIF_full,
y = VIF_partial,
by = "feature"
) |>
left_join(
y = VIF_simple,
by = "feature"
) |>
mutate(
across(
.cols = -feature,
.fns = ~ round(., 1)
),
feature = case_when(
feature == "no_hs" ~ "No High School",
feature == "high_school" ~ "High School Only",
feature == "some_col" ~ "Some College",
feature == "bach" ~ "Bachelor+",
feature == "poverty" ~ "Poverty",
feature == "minor_poverty" ~ 'Minor Poverty',
feature == "child_poverty" ~ 'Child Poverty',
feature == "unemployment" ~ 'Unemployment',
feature == "mhhi_per" ~ 'Rel Median Home Income'
)
)
colnames(VIF_results) <- c("Feature", "VIF1", "VIF2", "VIF3")
VIF_results |>
gt::gt() |>
gt::sub_missing(
columns = 2:4,
missing_text = ""
) |>
gt::cols_label(
VIF1 = "Full Model",
VIF2 = "Middle Model",
VIF3 = "Simple Model"
) |>
gt::tab_spanner(
label = "VIF",
columns = -Feature
)
Feature | VIF (Full Model) | VIF (Middle Model) | VIF (Simple Model) |
---|---|---|---|
No High School | 7599.2 | 2.0 | 1.9 |
High School Only | 13218.7 | 1.4 | 1.3 |
Some College | 6605.5 | 1.4 | 1.4 |
Bachelor+ | 23203.9 | ||
Poverty | 9.0 | 9.0 | 2.6 |
Minor Poverty | 85.5 | 9.5 | |
Child Poverty | 75.1 | ||
Unemployment | 1.3 | 1.3 | 1.3 |
Rel Median Household Income | 2.2 | 2.2 | 2.2 |
The VIF results agree with our findings from PCA. For the full model, the four education variables all have VIFs in the thousands, with bachelor's being over 23,000! The three poverty features also have high VIFs, especially minor and child poverty. For the middle set of features, there is still some concern about multicollinearity since the VIFs for poverty and minor poverty are near 10. For the simple set of six features (no high school, high school only, some college, poverty, unemployment, and relative median household income), no VIF is above 3, indicating no concern about linear associations!
The most common method of clustering is k-means. Its big advantage is that we only need to specify one parameter for the algorithm, \(k\): the number of clusters we want the algorithm to group the cases (counties) into. That requirement is also its big drawback, since the number of clusters is often one of the major points of interest in a clustering analysis.
The two other drawbacks of k-means are:
The method uses centroids (a p-dimensional vector of feature averages), which are susceptible to outliers, and outliers are prevalent in our data.
Each point (county) will be in one of the \(k\) clusters that are specified, so clusters can include outliers.
To combat outliers in the data, we will use k-medoids instead, which uses medoids (actual counties, a multivariate analogue of the median) rather than averaged centroids as cluster centers. The drawback of k-medoids is that finding the best medoids is much more computationally expensive than calculating means, so the algorithm runs significantly slower during each iteration.
k-medoids also has the drawback of needing to specify the choice of \(k\). So how do we determine a good choice for \(k\)?
Often, determining the number of clusters is done visually. If we only have two features, we can create a scatterplot and look to see if there are obvious groups in the plot. If there are more than two features, we can create a biplot: a scatterplot of a pair of principal components. Often just the first and second principal components are used, but we could look at other sets as well, depending on how much of the variance is preserved in the data.
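The chunk that produced the PCA summary below isn't shown; here is a sketch of how it was likely created, assuming counties2 (the object used in the clustering code later) is the robust-scaled data with the redundant features and ID columns removed, and counties_pca3 is a name chosen here for the resulting PCA object:
# Sketch (assumption): counties2 = robust-scaled features without the
# redundant education/poverty variables or the ID columns
counties2 <-
  counties_res |>
  dplyr::select(-fips, -rucc, -bach, -child_poverty, -minor_poverty)
# PCA on the six remaining features; the variance summary is shown below
counties_pca3 <- prcomp(counties2)
summary(counties_pca3)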
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 1.355 0.826 0.715 0.635 0.475 0.414
## Proportion of Variance 0.479 0.178 0.133 0.105 0.059 0.045
## Cumulative Proportion 0.479 0.658 0.791 0.896 0.955 1.000
From the results of the PCA, we'll look at pairwise plots of the first four PCs, which together preserve about 90% of the variance.
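The pairwise plot itself isn't reproduced here; a sketch of how it could be made with the hypothetical counties_pca3 object from above:
# Sketch: pairwise scatterplots of the first four PC scores
counties_pca3 |>
  pluck("x") |>
  data.frame() |>
  dplyr::select(PC1:PC4) |>
  GGally::ggpairs(progress = FALSE)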
From the pairwise scatterplots, there don't appear to be any obvious, natural clusters, but there do appear to be several outlier counties!
Another way of visually assessing how many clusters exist in the data is with an elbow plot. An elbow plot displays the “Within Sums of Squares” (WSS) on the y-axis (the total spread of the clusters) and the number of clusters on the x-axis. So it performs clustering from k = 1 up to k = k_max (decided by us) clusters and displays the WSS:
\[WSS = \sum_i\sum_j(y_{ij} - \bar{y}_i)^2\]
where \(i\) indicates the cluster and \(j\) indicates the \(j\)th observation in the \(i\)th cluster. It's sort of like a variance, controlling for the cluster each point is in and not dividing by \(n - 1\). A lower WSS (smaller spread) indicates that there are better-defined clusters in the data.
The downside of using WSS to judge how many clusters there are is that as \(k\) increases, WSS always decreases. So we can't just use the choice of \(k\) that makes WSS the smallest. Instead, we look for an “elbow” in the plot: the point where increasing from \(k\) to \(k + 1\) no longer causes a noticeable drop in WSS.
factoextra::fviz_nbclust(
x = counties2,
# Specifying to use k-mediods (pam() in the cluster package)
FUNcluster = cluster::pam,
method = "wss"
)
From the elbow plot, there is a noticeable drop in WSS from \(k = 1\) to \(k = 2\), then only small drops for each subsequent increase in \(k\), indicating that two clusters is the best choice. But the choice of \(k\) from an elbow plot can be subjective. Do we have more objective methods?
A silhouette plot is similar to an elbow plot in that it has the number of clusters on the x-axis and a metric to judge how well the data are clustered, called the silhouette score, on the y-axis. So what is the silhouette score? It's a little difficult to explain fully here, but it compares the cluster that point \(i\) was placed in to the “next best” cluster it could have been placed in. If the current cluster is a much better fit, then case \(i\) will have a large silhouette score, close to one. If the current and next-best clusters are about equally appropriate, the score will be near zero. The silhouette score displayed on a silhouette plot is the average of every case's silhouette score.
The big advantage is that, unlike the WSS in an elbow plot, the silhouette score isn't monotonic in \(k\) (it doesn't always decrease) but can increase and decrease as \(k\) changes, so we can simply choose the \(k\) that maximizes it.
factoextra::fviz_nbclust(
x = counties2,
# Specifying to use k-medoids
FUNcluster = cluster::pam,
method = "silhouette"
)
The silhouette plot above agrees with our conclusion from the elbow plot that the best choice is two clusters. We’ll look at one final graph to help decide on the best choice of \(k\).
The gap statistic uses simulated data sets that are similar to the original data in dimension and spread but have no true clusters. The simulated (fake) data are clustered using the same method as the real data, and the WSS is calculated for each choice of \(k\). This is repeated \(B\) times (usually 100 or 500), and the average WSS of these fake data sets is calculated for each choice of \(k\). That average is then compared to the WSS of the real data. We see an “improvement” in the real WSS when a new cluster is found by increasing \(k\), whereas if all the clusters in the real data are already accounted for, the change in WSS between the real and fake data should be similar.
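The code for the gap statistic plot isn't shown; a sketch using the same fviz_nbclust() workflow as the elbow and silhouette plots (nboot is the number of simulated reference data sets, \(B\)):
factoextra::fviz_nbclust(
  x = counties2,
  # Specifying to use k-medoids
  FUNcluster = cluster::pam,
  method = "gap_stat",
  # Number of simulated (reference) data sets
  nboot = 100
)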
The gap statistic recommends 5 clusters, which is quite a few more than the previous two methods. So which should we choose?
We can look at many different methods to help decide the best number of clusters, which we'll do in the next section: the NbClust() function in the NbClust package evaluates the best choice of \(k\) using 30 different metrics and saves the results. The code chunk below runs it and visualizes the results with a bar chart of how often each metric chooses a given value of k.
k_result <- NbClust::NbClust(data = counties2, method = "kmeans")
## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##
## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 8 proposed 2 as the best number of clusters
## * 6 proposed 3 as the best number of clusters
## * 1 proposed 5 as the best number of clusters
## * 1 proposed 9 as the best number of clusters
## * 1 proposed 11 as the best number of clusters
## * 3 proposed 13 as the best number of clusters
## * 1 proposed 14 as the best number of clusters
## * 2 proposed 15 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 2
##
##
## *******************************************************************
# Displaying the choices
k_result$Best.nc |>
t() |>
data.frame() |>
count(Number_clusters) |>
ggplot(
mapping = aes(
x = Number_clusters,
y = n
)
) +
geom_col(
fill = "steelblue",
color = "black"
) +
geom_text(
mapping = aes(label = n),
vjust = -0.5
) +
theme_classic() +
labs(x = "Best choice of k",
y = "Number of metrics") +
scale_y_continuous(
expand = c(0, 0, 0.05, 0),
breaks = 0:7*2
) +
scale_x_continuous(breaks = 0:15)
Eight metrics chose \(k = 2\) and six chose \(k = 3\), so we'll start k-medoids with \(k = 2\) and afterwards see how \(k = 3\) clusters the data.
set.seed(1234)
counties_km2 <-
cluster::pam(
x = counties2,
k = 2
)
factoextra::fviz_cluster(
counties_km2,
geom = "point",
#ellipse = F
) +
theme_minimal() +
theme(
plot.title = element_text(hjust = 0.5),
legend.position = "none"
) +
scale_fill_manual(
values = c("1" = "tomato", "2" = "steelblue")
) +
scale_color_manual(
values = c("1" = "tomato", "2" = "steelblue")
)
There doesn't appear to be much of an obvious difference in the plot above using the first two PCs; the algorithm just divides one large cluster in two. Six methods recommended three clusters, only two fewer than recommended two clusters, so what do our results look like if \(k = 3\)?
set.seed(1234)
counties_km3 <-
cluster::pam(
x = counties2,
k = 3
)
factoextra::fviz_cluster(
counties_km3,
geom = "point",
#ellipse = F
) +
theme_minimal() +
theme(
plot.title = element_text(hjust = 0.5),
legend.position = "none"
) +
scale_fill_manual(
values = c("1" = "tomato", "2" = "steelblue", "3" = "orange2")
) +
scale_color_manual(
values = c("1" = "tomato", "2" = "steelblue", "3" = "orange2")
)
Like with \(k = 2\), \(k = 3\) just divides the large cluster into three groups.
Let's compare the medoids of the two-cluster and three-cluster results.
counties[
# Keeping the rows that correspond to the medoid of each cluster
rownames(counties) %in%
c(counties_km2$medoids |>
data.frame() |>
rownames()),
] |>
# Dropping three irrelevant columns
dplyr::select(-fips, -rucc, -medhhinc) |>
round(digits = 1) |>
# Moving the counties to a column
rownames_to_column(var = "County") |>
# Creating a graphic table with a title and subtitle
gt::gt() |>
gt::tab_header(
title = gt::md("Medoid County Results"),
subtitle = gt::md("Two clusters")
)
Medoid County Results: Two clusters
County | no_hs | high_school | some_col | bach | poverty | minor_poverty | child_poverty | unemployment | mhhi_per |
---|---|---|---|---|---|---|---|---|---|
Clark County, IN | 9.5 | 33.3 | 32.6 | 24.6 | 10.4 | 13.9 | 13.5 | 2.7 | 100.2 |
Cherokee County, SC | 14.8 | 37.7 | 30.3 | 17.2 | 18.8 | 28.7 | 27.4 | 4.1 | 77.1 |
Clark County, IN and Cherokee County, SC are the medoid counties for the two clusters. One cluster (Clark) has higher college participation (some college and bachelor's), lower poverty and unemployment rates, and a higher relative median household income than the other cluster (Cherokee).
The two clusters appear to be divided along education and economic lines: higher college attainment and income versus lower college attainment and income.
Let's make the same comparison for three clusters.
counties[
# Keeping the rows that correspond to the medoid of each cluster
rownames(counties) %in%
c(counties_km3$medoids |>
data.frame() |>
rownames()),
] |>
# Dropping three irrelevant columns
dplyr::select(-fips, -rucc, -medhhinc) |>
round(digits = 1) |>
# Moving the counties to a column
rownames_to_column(var = "County") |>
# Creating a graphic table with a title and subtitle
gt::gt() |>
gt::tab_header(
title = gt::md("Medoid County Results"),
subtitle = gt::md("Three clusters")
)
Medoid County Results: Three clusters
County | no_hs | high_school | some_col | bach | poverty | minor_poverty | child_poverty | unemployment | mhhi_per |
---|---|---|---|---|---|---|---|---|---|
Searcy County, AR | 18.0 | 39.1 | 27.9 | 14.9 | 20.9 | 32.0 | 30.0 | 4.5 | 67.4 |
Marion County, FL | 10.8 | 35.1 | 32.1 | 22.0 | 13.6 | 20.7 | 19.2 | 3.4 | 86.5 |
Lee County, GA | 8.1 | 27.2 | 32.2 | 32.6 | 9.8 | 14.8 | 13.3 | 2.9 | 115.1 |
A similar result with three clusters: just low, medium, and high groups for the education and economic features.
Let's visualize the clustering results on a map of US counties.
usmap::plot_usmap(
regions = "counties",
data = counties |>
add_column(cluster = factor(counties_km2$clustering)),
values = "cluster",
linewidth = 0.1
) +
scale_fill_manual(
values = c("tomato", "steelblue"),
labels = c("High College & Income", "Low College & Income")
) +
# Adding the state outlines
# geom_polygon(
# data = state_map$data,
# mapping = aes(x = x, y = y, group = group),
# fill = NA,
# color = "black",
# linewidth = 0.35
# ) +
labs(
title = "US Counties: Two Clusters"
) +
theme(
legend.position = "none",
#legend.position = "top",
plot.title = element_text(hjust = 0.5, size = 20, face = "bold")
) +
scale_x_continuous(expand = c(0, 0, 0, 0)) +
scale_y_continuous(expand = c(0, 0, 0, 0))
usmap::plot_usmap(
regions = "counties",
data =
# Adding the cluster result to the counties data set
left_join(
x = counties |> rownames_to_column(var = "state_county"),
y = data.frame(
state_county = names(counties_km3$clustering),
cluster = factor(counties_km3$clustering)
),
by = "state_county"
),
values = "cluster",
linewidth = 0.1
) +
# Having the colors match the previous results and add orange2 for cluster 3
scale_fill_manual(
values = c("1" = "tomato", "2" = "steelblue", "3" = "orange2")
) +
# Adding a title
labs(
title = "US Counties: Three Clusters"
) +
# Removing the legend and centering the title
theme(
legend.position = "none",
plot.title = element_text(hjust = 0.5, size = 20, face = "bold")
) +
# Removing the buffer space
scale_x_continuous(expand = c(0, 0, 0, 0)) +
scale_y_continuous(expand = c(0, 0, 0, 0))
Neither map lends much insight into the clustering of the counties beyond what we already knew.
Another drawback of k-means/k-medoids is that every county will be placed in one of the specified clusters, even when that county is very different from every other county. We'd like a method that won't cluster a county if it is an outlier. What clustering method allows for outliers?
DBSCAN (Density-Based Spatial Clustering of Applications with Noise) is a different clustering algorithm that grows clusters from densely packed points, based on the distance from each point to its neighbors, and leaves points in sparse regions unclustered as noise.
To use DBSCAN, we need to specify two parameters:
minPts = minimum number of cases needed to start a new cluster
epsilon = the distance each case will “scan” for new points to add to its cluster
minPts tends to be a subjective parameter, depending on the number of cases in the data, but the algorithm isn’t very sensitive to small changes. With over 3,000 counties and six features used in the data, we’ll choose minPts = 15. DBSCAN is much more sensitive to the choice of epsilon. So how do we make a good choice? We look at the kNN distance graph to find an elbow:
# Creating a data frame of the knn distance with k = 15
data.frame(
knn_dist = dbscan::kNNdist(x = counties2, k = 15)
) |>
# Arranging each county from shortest distance the largest distance
arrange(knn_dist) |>
# Adding the row number to indicate the order
mutate(index = row_number()) |>
# Creating the line graph of kNN distance
ggplot(
mapping = aes(
x = index,
y = knn_dist
)
) +
geom_line() +
labs(
x = NULL,
y = "kNN Distance"
) +
scale_y_continuous(
breaks = seq(0, 6, 0.5)
) +
theme_minimal()
The elbow of the graph appears to occur at a kNN distance of around 1.25 to 1.5, so we'll choose epsilon = 1.375.
counties_db <-
dbscan::dbscan(
x = counties2,
eps = 1.375,
minPts = 15
)
counties_db
## DBSCAN clustering for 3141 objects.
## Parameters: eps = 1.375, minPts = 15
## Using euclidean distances and borderpoints = TRUE
## The clustering contains 1 cluster(s) and 49 noise points.
##
## 0 1
## 49 3092
##
## Available fields: cluster, eps, minPts, metric, borderPoints
DBSCAN forms one major cluster and leaves 49 counties unclustered (the outliers). This mirrors what we saw in the biplots earlier.
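To see which counties DBSCAN flags as outliers, we can pull the rows labeled as noise (cluster 0); a quick sketch:
# Sketch: the counties DBSCAN labels as noise (cluster 0 = the outliers)
row.names(counties2)[counties_db$cluster == 0] |>
  head(10)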
usmap::plot_usmap(
regions = "counties",
data =
left_join(
x = counties |> rownames_to_column(var = "state_county"),
y = data.frame(
state_county = row.names(counties2),
cluster = factor(counties_db$cluster)
),
by = "state_county"
),
values = "cluster",
linewidth = 0.1
) +
scale_fill_manual(
values = c("0" = "tomato", "1" = "steelblue"),
labels = c("0" = "Outlier", "1" = "Not Outlier"),
na.translate = F
) +
labs(
title = "US Counties: DBSCAN"
) +
theme(
legend.position = "right",
plot.title = element_text(hjust = 0.5, size = 20, face = "bold")
) +
scale_x_continuous(expand = c(0, 0, 0, 0)) +
scale_y_continuous(expand = c(0, 0, 0, 0))
The only major takeaway is that there are no outlier counties in the eastern states.
Unfortunately, there doesn't appear to be a strong indication that the counties of the US form natural clusters. Going into this analysis, I would have thought there would be at least two clusters: rural counties with weaker economic and education features, and urban counties with stronger economic and education features. While this is broadly true for very rural versus very urban counties, there are enough counties in the middle “bridging” the two sets that there isn't enough separation to identify two distinct clusters.