install.packages("tidyverse")
install.packages(c("arules","arulesViz"))
library(tidyverse)
library(readr)
library(psych)
library(factoextra)
library(cluster)
library(arules)
library(arulesViz)
The dataset used in this project is derived from the 2010 Small Area Health Insurance Estimates (SAHIE) program, published by the United States Census Bureau.
The dataset provides county-level and state-level estimates of:
Number of insured and uninsured individuals.
Insurance coverage rates
Demographic breakdowns
Geographic identifiers
df_raw <- read.csv("C:/Users/ASUS-PC/Desktop/UL_Project/sahie2010.csv",
skip = 3,
header = TRUE,
sep = ",",
stringsAsFactors = FALSE
)
df_raw$stcou <- gsub('^="|"$', "", df_raw$stcou)
df_raw$name <- trimws(df_raw$name)
dim(df_raw)
## [1] 216790 18
names(df_raw)
## [1] "year" "stcou" "geocat" "agecat" "racecat" "sexcat"
## [7] "iprcat" "name" "nipr" "nipr_moe" "nui" "nui_moe"
## [13] "nic" "nic_moe" "pctui" "pctui_moe" "pctic" "pctic_moe"
str(df_raw)
## 'data.frame': 216790 obs. of 18 variables:
## $ year : int 2010 2010 2010 2010 2010 2010 2010 2010 2010 2010 ...
## $ stcou : chr "=01000" "=02000" "=04000" "=05000" ...
## $ geocat : int 40 40 40 40 40 40 40 40 40 40 ...
## $ agecat : int 0 0 0 0 0 0 0 0 0 0 ...
## $ racecat : int 0 0 0 0 0 0 0 0 0 0 ...
## $ sexcat : int 0 0 0 0 0 0 0 0 0 0 ...
## $ iprcat : int 0 0 0 0 0 0 0 0 0 0 ...
## $ name : chr "Alabama" "Alaska" "Arizona" "Arkansas" ...
## $ nipr : chr "4021188" "647241" "5399995" "2431331" ...
## $ nipr_moe : chr "N/A" "N/A" "N/A" "N/A" ...
## $ nui : chr "681437" "138777" "1042809" "500134" ...
## $ nui_moe : chr "14309" "5722" "21121" "11712" ...
## $ nic : chr "3339750" "508464" "4357186" "1931198" ...
## $ nic_moe : chr "14309" "5722" "21121" "11712" ...
## $ pctui : chr "16.9" "21.4" "19.3" "20.6" ...
## $ pctui_moe: chr "0.4" "0.9" "0.4" "0.5" ...
## $ pctic : chr "83.1" "78.6" "80.7" "79.4" ...
## $ pctic_moe: chr "0.4" "0.9" "0.4" "0.5" ...
summary(df_raw)
## year stcou geocat agecat
## Min. :2010 Length:216790 Min. :40.00 Min. :0.000
## 1st Qu.:2010 Class :character 1st Qu.:50.00 1st Qu.:1.000
## Median :2010 Mode :character Median :50.00 Median :2.000
## Mean :2010 Mean :49.42 Mean :1.684
## 3rd Qu.:2010 3rd Qu.:50.00 3rd Qu.:3.000
## Max. :2010 Max. :50.00 Max. :4.000
## racecat sexcat iprcat name
## Min. :0.00000 Min. :0.0000 Min. :0 Length:216790
## 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:1 Class :character
## Median :0.00000 Median :1.0000 Median :2 Mode :character
## Mean :0.08469 Mean :0.9263 Mean :2
## 3rd Qu.:0.00000 3rd Qu.:2.0000 3rd Qu.:3
## Max. :3.00000 Max. :2.0000 Max. :4
## nipr nipr_moe nui nui_moe
## Length:216790 Length:216790 Length:216790 Length:216790
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## nic nic_moe pctui pctui_moe
## Length:216790 Length:216790 Length:216790 Length:216790
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## pctic pctic_moe
## Length:216790 Length:216790
## Class :character Class :character
## Mode :character Mode :character
##
##
##
# 删除包含 "moe" 的列
moe_cols <- grep("moe", names(df_raw), ignore.case = TRUE)
df_clean <- df_raw[, -moe_cols]
# 删除指定列(如果存在)
drop_cols <- c("nipr", "nic", "nui", "year")
drop_exist <- drop_cols[drop_cols %in% names(df_clean)]
df_clean <- df_clean[, !(names(df_clean) %in% drop_exist)]
# 检查分类变量
table(df_clean$geocat)
##
## 40 50
## 12495 204295
table(df_clean$iprcat)
##
## 0 1 2 3 4
## 43358 43358 43358 43358 43358
table(df_clean$racecat)
##
## 0 1 2 3
## 207610 3060 3060 3060
table(df_clean$agecat)
##
## 0 1 2 3 4
## 50205 50205 50205 50205 15970
table(df_clean$sexcat)
##
## 0 1 2
## 82910 66940 66940
df_state_raw <- subset(df_clean, geocat == 40)
df_county_raw <- subset(df_clean, geocat == 50)
statement:
- overall:All classifications are fixed at 0.
income:Set age/race/sex to 0, and only allow the iprcat (1 vs 4) to change.
race:Set age/sex to 0 and iprcat to 0, and only allow the racecat (1 vs 2) to change.
gap:income_gap and race_gap.
state_overall <- subset(
df_state_raw,
agecat == 0 & racecat == 0 & sexcat == 0 & iprcat == 0,
select = c("name", "pctui")
)
names(state_overall)[2] <- "overall_uninsured"
state_income_raw <- subset(
df_state_raw,
agecat == 0 & racecat == 0 & sexcat == 0 & iprcat %in% c(1,4),
select = c("name", "iprcat", "pctui")
)
state_income_wide <- reshape(
state_income_raw,
idvar = "name",
timevar = "iprcat",
direction = "wide"
)
# 重命名列
names(state_income_wide) <- gsub("pctui.1", "low_income_uninsured", names(state_income_wide))
names(state_income_wide) <- gsub("pctui.4", "high_income_uninsured", names(state_income_wide))
# 4.3 race:固定 age/sex=0 & iprcat=0,只让 racecat 变化(white=1, black=2)
state_race_raw <- subset(
df_state_raw,
agecat == 0 & sexcat == 0 & iprcat == 0 & racecat %in% c(1,2),
select = c("name", "racecat", "pctui")
)
state_race_wide <- reshape(
state_race_raw,
idvar = "name",
timevar = "racecat",
direction = "wide"
)
names(state_race_wide) <- gsub("pctui.1", "white_uninsured", names(state_race_wide))
names(state_race_wide) <- gsub("pctui.2", "black_uninsured", names(state_race_wide))
df_state <- merge(state_overall, state_income_wide, by = "name", all.x = TRUE)
df_state <- merge(df_state, state_race_wide, by = "name", all.x = TRUE)
clean_percent <- function(x){
as.numeric(gsub("%", "", gsub(",", "", x)))
}
df_state$overall_uninsured <- clean_percent(df_state$overall_uninsured)
df_state$low_income_uninsured <- clean_percent(df_state$low_income_uninsured)
df_state$high_income_uninsured <- clean_percent(df_state$high_income_uninsured)
df_state$white_uninsured <- clean_percent(df_state$white_uninsured)
df_state$black_uninsured <- clean_percent(df_state$black_uninsured)
df_state$income_gap <- df_state$low_income_uninsured - df_state$high_income_uninsured
df_state$race_gap <- df_state$black_uninsured - df_state$white_uninsured
df_state <- na.omit(df_state)
names(df_state)[1] <- "state"
county_overall <- subset(
df_county_raw,
agecat == 0 & racecat == 0 & sexcat == 0 & iprcat == 0,
select = c("stcou", "name", "pctui")
)
names(county_overall)[3] <- "overall_uninsured"
county_income_raw <- subset(
df_county_raw,
agecat == 0 & racecat == 0 & sexcat == 0 & iprcat %in% c(1,4),
select = c("stcou", "name", "iprcat", "pctui")
)
county_income_wide <- reshape(
county_income_raw,
idvar = c("stcou", "name"),
timevar = "iprcat",
direction = "wide"
)
names(county_income_wide) <- gsub("pctui.1", "low_income_uninsured", names(county_income_wide))
names(county_income_wide) <- gsub("pctui.4", "high_income_uninsured", names(county_income_wide))
# 合并
df_county <- merge(county_overall, county_income_wide,
by = c("stcou", "name"), all.x = TRUE)
# 数值转换
df_county$overall_uninsured <- clean_percent(df_county$overall_uninsured)
df_county$low_income_uninsured <- clean_percent(df_county$low_income_uninsured)
df_county$high_income_uninsured <- clean_percent(df_county$high_income_uninsured)
df_county$income_gap <- df_county$low_income_uninsured - df_county$high_income_uninsured
df_county <- na.omit(df_county)
names(df_county)[2] <- "county"
# State
X_state <- df_state[, !(names(df_state) %in% c("state"))]
X_state_scaled <- scale(X_state)
# County
X_county <- df_county[, !(names(df_county) %in% c("stcou", "county"))]
X_county_scaled <- scale(X_county)
# 检查维度
dim(df_state)
## [1] 51 8
dim(X_state_scaled)
## [1] 51 7
dim(df_county)
## [1] 3142 6
# ---- (A) Interpretation PCA (REPORT) ----
pca.f <- principal(X_county_scaled, nfactors = 4, rotate = "varimax", scores = TRUE)
pca.f
## Principal Components Analysis
## Call: principal(r = X_county_scaled, nfactors = 4, rotate = "varimax",
## scores = TRUE)
## Standardized loadings (pattern matrix) based upon correlation matrix
## RC1 RC2 RC3 RC4 h2 u2 com
## overall_uninsured 0.98 -0.15 -0.14 0 1 -4.4e-16 1.1
## low_income_uninsured 0.89 0.45 0.09 0 1 6.7e-16 1.5
## high_income_uninsured 0.99 0.10 0.10 0 1 3.3e-16 1.0
## income_gap 0.05 1.00 0.02 0 1 -2.0e-15 1.0
##
## RC1 RC2 RC3 RC4
## SS loadings 2.73 1.23 0.04 0
## Proportion Var 0.68 0.31 0.01 0
## Cumulative Var 0.68 0.99 1.00 1
## Proportion Explained 0.68 0.31 0.01 0
## Cumulative Proportion 0.68 0.99 1.00 1
##
## Mean item complexity = 1.2
## Test of the hypothesis that 4 components are sufficient.
##
## The root mean square of the residuals (RMSR) is 0
## with the empirical chi square 0 with prob < NA
##
## Fit based upon off diagonal values = 1
print(loadings(pca.f), digits = 3, cutoff = 0.4, sort = TRUE)
##
## Loadings:
## RC1 RC2 RC3 RC4
## overall_uninsured 0.980
## low_income_uninsured 0.888 0.450
## high_income_uninsured 0.990
## income_gap 0.999
##
## RC1 RC2 RC3 RC4
## SS loadings 2.733 1.231 0.036 0
## Proportion Var 0.683 0.308 0.009 0
## Cumulative Var 0.683 0.991 1.000 1
# rotated loadings & PC scores
pc_loadings_rot <- as.data.frame(unclass(pca.f$loadings))
pc_scores <- as.data.frame(pca.f$scores)
#
pca.vis <- prcomp(X_county_scaled, center = FALSE, scale. = FALSE)
# Scree plot(eigenvalue/variance)
fviz_eig(pca.vis, addlabels = TRUE)
# Loading / Variable plot
fviz_pca_var(pca.vis, repel = TRUE)
# Individuals plot
fviz_pca_ind(pca.vis, geom = "point", repel = TRUE)
# Biplot
fviz_pca_biplot(pca.vis, repel = TRUE)
>Interpretation
Principal Component Analysis (PCA) was conducted on the standardized county-level insurance variables. The scree plot shows that the first principal component (PC1) explains 70.9% of the total variance, while the second component (PC2) explains 28.4%. Together, the first two components account for 99.1% of the total variance, indicating that almost all information in the dataset can be captured by just two dimensions. The remaining components contribute negligibly (PC3 = 0.7%, PC4 = 0%), suggesting that a two-dimensional representation is sufficient.
The rotated loading matrix provides a clear interpretation of these dimensions. PC1 has very high loadings on overall_uninsured (0.98), high_income_uninsured (0.99), and low_income_uninsured (0.89), indicating that this component represents a general insurance coverage intensity dimension. Counties scoring high on PC1 have systematically higher uninsured rates across income groups.
PC2, on the other hand, loads almost entirely on income_gap (0.999), suggesting that this dimension captures income inequality structure rather than overall insurance levels. The variable plot confirms this interpretation: uninsured-related variables align strongly along Dim1 (70.9%), while income_gap is primarily aligned with Dim2 (28.4%).
mds_n <- min(2000, nrow(pc_scores))
mds_id <- sample(seq_len(nrow(pc_scores)), mds_n)
D_mds <- dist(pc_scores[mds_id, , drop = FALSE])
mds_xy <- cmdscale(D_mds, k = 2)
mds_df <- data.frame(
Dim1 = mds_xy[, 1],
Dim2 = mds_xy[, 2]
)
dim(X_county_scaled)
## [1] 3142 4
Dimension 1 – Socioeconomic Advantage
Dimension 2 – Insurance Structure & Vulnerability
MDS Interpretation
# 确保是 matrix
X_county_scaled <- as.matrix(X_county_scaled)
set.seed(123)
fviz_nbclust(X_county_scaled, FUNcluster = pam, method = "silhouette", k.max = 10) +
theme_minimal() +
labs(title = "Choosing k (County, PAM) using silhouette")
best_k <- 2
pam_res <- pam(X_county_scaled, k = best_k)
county_pam <- eclust(
X_county_scaled,
FUNcluster = "pam",
k = best_k,
graph = FALSE
)
# label
county_cluster <- county_pam$cluster
table(county_cluster)
## county_cluster
## 1 2
## 1670 1472
# Cluster profiling:Original standardized variable
prof_raw <- describeBy(as.data.frame(X_county_scaled), group = county_cluster, mat = TRUE)
prof_raw
## item group1 vars n mean sd median
## overall_uninsured1 1 1 1 1670 -0.6899227 0.5784077 -0.64919353
## overall_uninsured2 2 2 1 1472 0.7827249 0.7760368 0.67223388
## low_income_uninsured1 3 1 2 1670 -0.7185475 0.5608215 -0.65711770
## low_income_uninsured2 4 2 2 1472 0.8152000 0.7261304 0.64258340
## high_income_uninsured1 5 1 3 1670 -0.7404940 0.5056826 -0.68159838
## high_income_uninsured2 6 2 3 1472 0.8400985 0.7186037 0.66137049
## income_gap1 7 1 4 1670 -0.1849066 0.9276160 -0.28057189
## income_gap2 8 2 4 1472 0.2097785 1.0373700 0.08594596
## trimmed mad min max range
## overall_uninsured1 -0.6800960 0.6618744 -2.6670489 0.8686623 3.535711
## overall_uninsured2 0.7276243 0.6353994 -1.8456210 4.0829452 5.928566
## low_income_uninsured1 -0.6995353 0.5693223 -3.0940573 0.9084314 4.002489
## low_income_uninsured2 0.7423955 0.6569103 -0.6423484 4.5712254 5.213574
## high_income_uninsured1 -0.7177108 0.5475486 -2.7632001 0.3088412 3.072041
## high_income_uninsured2 0.7531866 0.6471028 -0.4465788 4.9420838 5.388663
## income_gap1 -0.2284473 0.7849102 -3.1312662 5.0950232 8.226289
## income_gap2 0.1440246 0.9056656 -3.6606809 5.4208168 9.081498
## skew kurtosis se
## overall_uninsured1 -0.1993882 -0.3857059 0.01415389
## overall_uninsured2 0.7520742 1.5407543 0.02022686
## low_income_uninsured1 -0.4306136 0.5430498 0.01372355
## low_income_uninsured2 0.9360990 0.6887952 0.01892608
## high_income_uninsured1 -0.4827445 -0.1417078 0.01237428
## high_income_uninsured2 1.1617190 1.4654703 0.01872990
## income_gap1 0.5793741 1.6585375 0.02269917
## income_gap2 0.7217728 1.8883646 0.02703833
# Cluster profiling:PC
prof_pc <- describeBy(pc_scores, group = county_cluster, mat = TRUE)
prof_pc
## item group1 vars n mean sd median
## RC11 1 1 1 1670 -2.056500e+00 1.420925e+00 -1.833936e+00
## RC12 2 2 1 1472 2.333121e+00 1.937890e+00 1.809761e+00
## RC21 3 1 2 1670 -4.816668e-01 1.121789e+00 -5.709203e-01
## RC22 4 2 2 1472 5.464562e-01 1.268645e+00 4.014477e-01
## RC31 5 1 3 1670 -4.406415e-02 8.091444e-02 -5.397956e-02
## RC32 6 2 3 1472 4.999126e-02 1.052532e-01 3.269777e-02
## RC41 7 1 4 1670 -8.611639e-09 8.613196e-09 -8.628185e-09
## RC42 8 2 4 1472 9.769999e-09 1.047807e-08 8.043680e-09
## trimmed mad min max range
## RC11 -1.971552e+00 1.592092e+00 -7.992411e+00 3.570672e-01 8.349478e+00
## RC12 2.087008e+00 1.722893e+00 -6.517869e-01 1.295062e+01 1.360241e+01
## RC21 -5.297753e-01 1.001889e+00 -4.138088e+00 5.519699e+00 9.657787e+00
## RC22 4.581844e-01 1.107048e+00 -4.110167e+00 6.872363e+00 1.098253e+01
## RC31 -4.929714e-02 7.083578e-02 -2.950622e-01 3.431489e-01 6.382111e-01
## RC32 3.986948e-02 9.078509e-02 -2.819518e-01 5.431796e-01 8.251314e-01
## RC41 -8.638285e-09 7.980299e-09 -4.058805e-08 2.397939e-08 6.456745e-08
## RC42 8.931764e-09 9.776165e-09 -2.029838e-08 5.362453e-08 7.392291e-08
## skew kurtosis se
## RC11 -0.61941688 0.1145083 3.477065e-02
## RC12 1.18247759 1.4094002 5.050975e-02
## RC21 0.53955870 1.5180780 2.745066e-02
## RC22 0.76161454 1.6211093 3.306635e-02
## RC31 0.82419500 2.0143917 1.980012e-03
## RC32 1.08307668 1.9426601 2.743350e-03
## RC41 0.02163591 0.8354717 2.107690e-10
## RC42 0.78125839 0.6310689 2.731040e-10
Based on the model results, the counties can be grouped into several clusters with significantly different socio-economic and insurance characteristics. Cluster 1 – High Insurance Coverage & Higher Income Areas
Counties in this cluster are characterized by low uninsured rates、higher median income levels、larger proportion of working-age insured population、possibly urban or economically developed regions.
From a real-world perspective, this cluster likely represents economically stable counties where access to employer-sponsored or private health insurance is more common. Higher income levels generally correlate with better access to health benefits.
Cluster 2 – Moderate Coverage & Mixed Socioeconomic Structure
Cluster 2 – High Uninsured & Low-Income Areas
Counties in this cluster exhibit Higher uninsured rates、Lower median income、Greater reliance on public insurance programs.
In practical terms, these counties may reflect economically disadvantaged regions with structural barriers to healthcare access. These areas could be priority targets for policy intervention.
-I visualized the clustering results using MDS and PCA to better understand the separation between groups. I also generated a silhouette plot to evaluate cluster quality and assess how well each county fits within its assigned cluster. These visualizations help me interpret the clustering structure and validate the choice of k.
mds_cluster <- as.factor(county_cluster[mds_id])
fviz_cluster(
list(data = mds_df, cluster = mds_cluster),
geom = "point",
ellipse.type = "norm",
show.clust.cent = TRUE
) +
theme_minimal() +
labs(title = "MDS (sampled) colored by County clusters")
# Silhouette 图
fviz_silhouette(county_pam) +
theme_minimal() +
labs(title = "Silhouette plot (County, PAM)")
## cluster size ave.sil.width
## 1 1 1670 0.45
## 2 2 1472 0.32
# 聚类在 PCA 空间可视化
fviz_cluster(
county_pam,
data = X_county_scaled,
geom = "point",
ellipse.type = "norm"
) +
theme_minimal() +
labs(title = "County clusters (PAM) visualized in PCA space")
county_km <- eclust(
X_county_scaled,
FUNcluster = "kmeans",
k = best_k,
graph = FALSE
)
fviz_cluster(county_km, data = X_county_scaled, geom = "point") +
theme_minimal() +
labs(title = "County clusters (kmeans) in PCA space")
fviz_silhouette(county_km) +
theme_minimal() +
labs(title = "Silhouette plot (County, kmeans)")
## cluster size ave.sil.width
## 1 1 1307 0.33
## 2 2 1835 0.44
Given the strong dominance of the uninsured dimension (70.9% variance explained), both K-means and PAM capture the same underlying structure. However, due to the socioeconomic nature of the data and the presence of potential extreme counties, PAM offers slightly greater robustness and interpretability, while K-means provides cleaner geometric separation.
Overall, the clustering analysis confirms that insurance disparities across counties are primarily structured along two dominant axes: overall insurance vulnerability and income inequality.
I prepared the county-level dataset for association rule mining by converting the continuous, standardized features into a transaction-style representation that is required by the Apriori algorithm. First, I selected the county feature matrix (X_county_scaled) and converted it into a data frame (X_rule) to facilitate column-wise processing.Since association rules work with categorical “items” rather than continuous numeric values, I discretized each variable into three ordinal categories—Low, Medium, and High—using quantile-based cut points (0%, 33.3%, 66.7%, and 100%). This approach ensures that each category contains roughly a similar proportion of observations and reduces sensitivity to outliers.
After discretization, I created interpretable item labels by concatenating the original variable name with its category level (e.g., overall_uninsured_High or income_gap_Low).This step makes the resulting rules easy to read and directly connects each item to a meaningful feature. Next, I transformed the data into a basket-list structure, where each county is treated as one “transaction” containing a set of items (one item per variable). Finally, I converted this basket-list into a transactions object (trans), which is the standard input format for the arules package. I then used summary(trans) to inspect the basic characteristics of the transaction data (number of transactions, item frequencies, and sparsity) before running Apriori to extract rules with support, confidence, and lift.
set.seed(123)
# eigenvalue
X_rule <- as.data.frame(X_county_scaled)
# 9.2 Numerical discretization
disc3 <- function(x){
q <- quantile(x, probs = c(0, 1/3, 2/3, 1), na.rm = TRUE, type = 7)
cut(x, breaks = q, include.lowest = TRUE, labels = c("Low","Med","High"))
}
X_cat <- as.data.frame(lapply(X_rule, disc3))
# Combine the variable names and categories to form "item"
for(j in seq_along(X_cat)){
nm <- names(X_cat)[j]
X_cat[[j]] <- paste(nm, X_cat[[j]], sep = "_")
}
# transactions
basket_list <- lapply(seq_len(nrow(X_cat)), function(i){
unlist(X_cat[i, , drop = FALSE], use.names = FALSE)
})
trans <- as(basket_list, "transactions")
summary(trans)
## transactions as itemMatrix in sparse format with
## 3142 rows (elements/itemsets/transactions) and
## 12 columns (items) and a density of 0.3333333
##
## most frequent items:
## low_income_uninsured_Low overall_uninsured_Low overall_uninsured_Med
## 1062 1058 1056
## high_income_uninsured_Low income_gap_Med (Other)
## 1055 1055 7282
##
## element (itemset/transaction) length distribution:
## sizes
## 4
## 3142
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 4 4 4 4 4 4
##
## includes extended item information - examples:
## labels
## 1 high_income_uninsured_High
## 2 high_income_uninsured_Low
## 3 high_income_uninsured_Med
-I applied the Apriori algorithm to the transaction dataset in order to extract association rules from the county-level features. I set a minimum support threshold of 0.05, meaning that a rule must appear in at least 5% of the observations to be considered. I also required a minimum confidence of 0.60, ensuring that the rule has at least a 60% conditional probability of occurring.
# 9.3 Apriori
rules_all <- apriori(
trans,
parameter = list(
supp = 0.05,
conf = 0.60,
minlen = 2,
maxlen = 5
)
)
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.6 0.1 1 none FALSE TRUE 5 0.05 2
## maxlen target ext
## 5 rules TRUE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 157
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[12 item(s), 3142 transaction(s)] done [0.00s].
## sorting and recoding items ... [12 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 done [0.00s].
## writing ... [86 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
rules_all <- sort(rules_all, by = "lift", decreasing = TRUE)
inspect(head(rules_all, 20))
## lhs rhs support confidence coverage lift count
## [1] {income_gap_High,
## overall_uninsured_High} => {low_income_uninsured_High} 0.10057288 1.0000000 0.10057288 3.044574 316
## [2] {high_income_uninsured_High,
## income_gap_High} => {low_income_uninsured_High} 0.13494589 1.0000000 0.13494589 3.044574 424
## [3] {high_income_uninsured_High,
## income_gap_High,
## overall_uninsured_High} => {low_income_uninsured_High} 0.09898154 1.0000000 0.09898154 3.044574 311
## [4] {income_gap_Low,
## low_income_uninsured_High} => {overall_uninsured_High} 0.05028644 0.9937107 0.05060471 3.037197 158
## [5] {high_income_uninsured_High,
## income_gap_Low,
## low_income_uninsured_High} => {overall_uninsured_High} 0.05028644 0.9937107 0.05060471 3.037197 158
## [6] {income_gap_Low,
## low_income_uninsured_High} => {high_income_uninsured_High} 0.05060471 1.0000000 0.05060471 3.032819 159
## [7] {income_gap_Low,
## low_income_uninsured_High,
## overall_uninsured_High} => {high_income_uninsured_High} 0.05028644 1.0000000 0.05028644 3.032819 158
## [8] {low_income_uninsured_High,
## overall_uninsured_High} => {high_income_uninsured_High} 0.23711012 0.9880637 0.23997454 2.996618 745
## [9] {income_gap_Med,
## low_income_uninsured_High,
## overall_uninsured_High} => {high_income_uninsured_High} 0.08784214 0.9857143 0.08911521 2.989493 276
## [10] {income_gap_High,
## overall_uninsured_High} => {high_income_uninsured_High} 0.09898154 0.9841772 0.10057288 2.984831 311
## [11] {income_gap_High,
## low_income_uninsured_High,
## overall_uninsured_High} => {high_income_uninsured_High} 0.09898154 0.9841772 0.10057288 2.984831 311
## [12] {income_gap_Med,
## low_income_uninsured_High} => {high_income_uninsured_High} 0.09452578 0.9801980 0.09643539 2.972763 297
## [13] {high_income_uninsured_High,
## income_gap_Low} => {overall_uninsured_High} 0.08561426 0.9711191 0.08816041 2.968148 269
## [14] {high_income_uninsured_Low,
## income_gap_Low} => {low_income_uninsured_Low} 0.13749204 1.0000000 0.13749204 2.958569 432
## [15] {high_income_uninsured_Low,
## income_gap_Low,
## overall_uninsured_Low} => {low_income_uninsured_Low} 0.10661999 1.0000000 0.10661999 2.958569 335
## [16] {income_gap_Med,
## low_income_uninsured_Low,
## overall_uninsured_Low} => {high_income_uninsured_Low} 0.09388924 0.9932660 0.09452578 2.958144 295
## [17] {income_gap_Low,
## overall_uninsured_Low} => {low_income_uninsured_Low} 0.10948440 0.9971014 0.10980267 2.949993 344
## [18] {high_income_uninsured_Low,
## income_gap_High} => {overall_uninsured_Low} 0.08752387 0.9927798 0.08816041 2.948312 275
## [19] {low_income_uninsured_Low,
## overall_uninsured_Low} => {high_income_uninsured_Low} 0.24283896 0.9857881 0.24633991 2.935873 763
## [20] {income_gap_Low,
## low_income_uninsured_Low,
## overall_uninsured_Low} => {high_income_uninsured_Low} 0.10661999 0.9738372 0.10948440 2.900281 335
# filtering
rules_strong <- subset(rules_all, subset = lift > 1.2)
inspect(head(rules_strong, 20))
## lhs rhs support confidence coverage lift count
## [1] {income_gap_High,
## overall_uninsured_High} => {low_income_uninsured_High} 0.10057288 1.0000000 0.10057288 3.044574 316
## [2] {high_income_uninsured_High,
## income_gap_High} => {low_income_uninsured_High} 0.13494589 1.0000000 0.13494589 3.044574 424
## [3] {high_income_uninsured_High,
## income_gap_High,
## overall_uninsured_High} => {low_income_uninsured_High} 0.09898154 1.0000000 0.09898154 3.044574 311
## [4] {income_gap_Low,
## low_income_uninsured_High} => {overall_uninsured_High} 0.05028644 0.9937107 0.05060471 3.037197 158
## [5] {high_income_uninsured_High,
## income_gap_Low,
## low_income_uninsured_High} => {overall_uninsured_High} 0.05028644 0.9937107 0.05060471 3.037197 158
## [6] {income_gap_Low,
## low_income_uninsured_High} => {high_income_uninsured_High} 0.05060471 1.0000000 0.05060471 3.032819 159
## [7] {income_gap_Low,
## low_income_uninsured_High,
## overall_uninsured_High} => {high_income_uninsured_High} 0.05028644 1.0000000 0.05028644 3.032819 158
## [8] {low_income_uninsured_High,
## overall_uninsured_High} => {high_income_uninsured_High} 0.23711012 0.9880637 0.23997454 2.996618 745
## [9] {income_gap_Med,
## low_income_uninsured_High,
## overall_uninsured_High} => {high_income_uninsured_High} 0.08784214 0.9857143 0.08911521 2.989493 276
## [10] {income_gap_High,
## overall_uninsured_High} => {high_income_uninsured_High} 0.09898154 0.9841772 0.10057288 2.984831 311
## [11] {income_gap_High,
## low_income_uninsured_High,
## overall_uninsured_High} => {high_income_uninsured_High} 0.09898154 0.9841772 0.10057288 2.984831 311
## [12] {income_gap_Med,
## low_income_uninsured_High} => {high_income_uninsured_High} 0.09452578 0.9801980 0.09643539 2.972763 297
## [13] {high_income_uninsured_High,
## income_gap_Low} => {overall_uninsured_High} 0.08561426 0.9711191 0.08816041 2.968148 269
## [14] {high_income_uninsured_Low,
## income_gap_Low} => {low_income_uninsured_Low} 0.13749204 1.0000000 0.13749204 2.958569 432
## [15] {high_income_uninsured_Low,
## income_gap_Low,
## overall_uninsured_Low} => {low_income_uninsured_Low} 0.10661999 1.0000000 0.10661999 2.958569 335
## [16] {income_gap_Med,
## low_income_uninsured_Low,
## overall_uninsured_Low} => {high_income_uninsured_Low} 0.09388924 0.9932660 0.09452578 2.958144 295
## [17] {income_gap_Low,
## overall_uninsured_Low} => {low_income_uninsured_Low} 0.10948440 0.9971014 0.10980267 2.949993 344
## [18] {high_income_uninsured_Low,
## income_gap_High} => {overall_uninsured_Low} 0.08752387 0.9927798 0.08816041 2.948312 275
## [19] {low_income_uninsured_Low,
## overall_uninsured_Low} => {high_income_uninsured_Low} 0.24283896 0.9857881 0.24633991 2.935873 763
## [20] {income_gap_Low,
## low_income_uninsured_Low,
## overall_uninsured_Low} => {high_income_uninsured_Low} 0.10661999 0.9738372 0.10948440 2.900281 335
# visualization
topN <- min(30, length(rules_strong))
rules_top <- head(rules_strong, topN)
inspect(rules_top)
## lhs rhs support confidence coverage lift count
## [1] {income_gap_High,
## overall_uninsured_High} => {low_income_uninsured_High} 0.10057288 1.0000000 0.10057288 3.044574 316
## [2] {high_income_uninsured_High,
## income_gap_High} => {low_income_uninsured_High} 0.13494589 1.0000000 0.13494589 3.044574 424
## [3] {high_income_uninsured_High,
## income_gap_High,
## overall_uninsured_High} => {low_income_uninsured_High} 0.09898154 1.0000000 0.09898154 3.044574 311
## [4] {income_gap_Low,
## low_income_uninsured_High} => {overall_uninsured_High} 0.05028644 0.9937107 0.05060471 3.037197 158
## [5] {high_income_uninsured_High,
## income_gap_Low,
## low_income_uninsured_High} => {overall_uninsured_High} 0.05028644 0.9937107 0.05060471 3.037197 158
## [6] {income_gap_Low,
## low_income_uninsured_High} => {high_income_uninsured_High} 0.05060471 1.0000000 0.05060471 3.032819 159
## [7] {income_gap_Low,
## low_income_uninsured_High,
## overall_uninsured_High} => {high_income_uninsured_High} 0.05028644 1.0000000 0.05028644 3.032819 158
## [8] {low_income_uninsured_High,
## overall_uninsured_High} => {high_income_uninsured_High} 0.23711012 0.9880637 0.23997454 2.996618 745
## [9] {income_gap_Med,
## low_income_uninsured_High,
## overall_uninsured_High} => {high_income_uninsured_High} 0.08784214 0.9857143 0.08911521 2.989493 276
## [10] {income_gap_High,
## overall_uninsured_High} => {high_income_uninsured_High} 0.09898154 0.9841772 0.10057288 2.984831 311
## [11] {income_gap_High,
## low_income_uninsured_High,
## overall_uninsured_High} => {high_income_uninsured_High} 0.09898154 0.9841772 0.10057288 2.984831 311
## [12] {income_gap_Med,
## low_income_uninsured_High} => {high_income_uninsured_High} 0.09452578 0.9801980 0.09643539 2.972763 297
## [13] {high_income_uninsured_High,
## income_gap_Low} => {overall_uninsured_High} 0.08561426 0.9711191 0.08816041 2.968148 269
## [14] {high_income_uninsured_Low,
## income_gap_Low} => {low_income_uninsured_Low} 0.13749204 1.0000000 0.13749204 2.958569 432
## [15] {high_income_uninsured_Low,
## income_gap_Low,
## overall_uninsured_Low} => {low_income_uninsured_Low} 0.10661999 1.0000000 0.10661999 2.958569 335
## [16] {income_gap_Med,
## low_income_uninsured_Low,
## overall_uninsured_Low} => {high_income_uninsured_Low} 0.09388924 0.9932660 0.09452578 2.958144 295
## [17] {income_gap_Low,
## overall_uninsured_Low} => {low_income_uninsured_Low} 0.10948440 0.9971014 0.10980267 2.949993 344
## [18] {high_income_uninsured_Low,
## income_gap_High} => {overall_uninsured_Low} 0.08752387 0.9927798 0.08816041 2.948312 275
## [19] {low_income_uninsured_Low,
## overall_uninsured_Low} => {high_income_uninsured_Low} 0.24283896 0.9857881 0.24633991 2.935873 763
## [20] {income_gap_Low,
## low_income_uninsured_Low,
## overall_uninsured_Low} => {high_income_uninsured_Low} 0.10661999 0.9738372 0.10948440 2.900281 335
## [21] {income_gap_Low,
## overall_uninsured_Low} => {high_income_uninsured_Low} 0.10661999 0.9710145 0.10980267 2.891874 335
## [22] {high_income_uninsured_Low,
## income_gap_Med,
## overall_uninsured_Low} => {low_income_uninsured_Low} 0.09388924 0.9703947 0.09675366 2.870980 295
## [23] {low_income_uninsured_Med,
## overall_uninsured_Med} => {high_income_uninsured_Med} 0.17441120 0.9547038 0.18268619 2.854119 548
## [24] {high_income_uninsured_High,
## income_gap_Med,
## low_income_uninsured_High} => {overall_uninsured_High} 0.08784214 0.9292929 0.09452578 2.840310 276
## [25] {income_gap_Med,
## low_income_uninsured_High} => {overall_uninsured_High} 0.08911521 0.9240924 0.09643539 2.824415 280
## [26] {income_gap_Med,
## low_income_uninsured_Med,
## overall_uninsured_Med} => {high_income_uninsured_Med} 0.09007002 0.9433333 0.09548059 2.820127 283
## [27] {high_income_uninsured_High,
## income_gap_Med} => {overall_uninsured_High} 0.09834500 0.9223881 0.10661999 2.819206 309
## [28] {income_gap_Med,
## low_income_uninsured_Low} => {high_income_uninsured_Low} 0.10343730 0.9365994 0.11043921 2.789380 325
## [29] {high_income_uninsured_Med,
## income_gap_Med,
## overall_uninsured_Med} => {low_income_uninsured_Med} 0.09007002 0.9278689 0.09707193 2.781836 283
## [30] {high_income_uninsured_Low,
## income_gap_Med} => {low_income_uninsured_Low} 0.10343730 0.9393064 0.11012094 2.779002 325
-I converted the top association rules into a data frame format to facilitate interpretation and reporting. Since the rules are originally stored as combined strings in the form {LHS} => {RHS}, I split each rule into its left-hand side (LHS) and right-hand side (RHS) components and removed the curly brackets for clarity.I then constructed a clean summary table containing the LHS, RHS, support, confidence, and lift values for each rule. Finally, I sorted the table in descending order of lift to prioritize the strongest and most meaningful associations, and printed the top results for interpretation.
rules_top_df <- as(rules_top, "data.frame")
lhs_rhs <- strsplit(as.character(rules_top_df$rules), "=>", fixed = TRUE)
lhs <- trimws(gsub("^\\{|\\}$", "", sapply(lhs_rhs, `[`, 1)))
rhs <- trimws(gsub("^\\{|\\}$", "", sapply(lhs_rhs, `[`, 2)))
rules_top_table <- data.frame(
LHS = lhs,
RHS = rhs,
support = rules_top_df$support,
confidence = rules_top_df$confidence,
lift = rules_top_df$lift,
stringsAsFactors = FALSE
)
rules_top_table <- rules_top_table[order(-rules_top_table$lift), ]
print(head(rules_top_table, 30))
## LHS
## 1 income_gap_High,overall_uninsured_High}
## 2 high_income_uninsured_High,income_gap_High}
## 3 high_income_uninsured_High,income_gap_High,overall_uninsured_High}
## 4 income_gap_Low,low_income_uninsured_High}
## 5 high_income_uninsured_High,income_gap_Low,low_income_uninsured_High}
## 6 income_gap_Low,low_income_uninsured_High}
## 7 income_gap_Low,low_income_uninsured_High,overall_uninsured_High}
## 8 low_income_uninsured_High,overall_uninsured_High}
## 9 income_gap_Med,low_income_uninsured_High,overall_uninsured_High}
## 10 income_gap_High,overall_uninsured_High}
## 11 income_gap_High,low_income_uninsured_High,overall_uninsured_High}
## 12 income_gap_Med,low_income_uninsured_High}
## 13 high_income_uninsured_High,income_gap_Low}
## 14 high_income_uninsured_Low,income_gap_Low}
## 15 high_income_uninsured_Low,income_gap_Low,overall_uninsured_Low}
## 16 income_gap_Med,low_income_uninsured_Low,overall_uninsured_Low}
## 17 income_gap_Low,overall_uninsured_Low}
## 18 high_income_uninsured_Low,income_gap_High}
## 19 low_income_uninsured_Low,overall_uninsured_Low}
## 20 income_gap_Low,low_income_uninsured_Low,overall_uninsured_Low}
## 21 income_gap_Low,overall_uninsured_Low}
## 22 high_income_uninsured_Low,income_gap_Med,overall_uninsured_Low}
## 23 low_income_uninsured_Med,overall_uninsured_Med}
## 24 high_income_uninsured_High,income_gap_Med,low_income_uninsured_High}
## 25 income_gap_Med,low_income_uninsured_High}
## 26 income_gap_Med,low_income_uninsured_Med,overall_uninsured_Med}
## 27 high_income_uninsured_High,income_gap_Med}
## 28 income_gap_Med,low_income_uninsured_Low}
## 29 high_income_uninsured_Med,income_gap_Med,overall_uninsured_Med}
## 30 high_income_uninsured_Low,income_gap_Med}
## RHS support confidence lift
## 1 {low_income_uninsured_High 0.10057288 1.0000000 3.044574
## 2 {low_income_uninsured_High 0.13494589 1.0000000 3.044574
## 3 {low_income_uninsured_High 0.09898154 1.0000000 3.044574
## 4 {overall_uninsured_High 0.05028644 0.9937107 3.037197
## 5 {overall_uninsured_High 0.05028644 0.9937107 3.037197
## 6 {high_income_uninsured_High 0.05060471 1.0000000 3.032819
## 7 {high_income_uninsured_High 0.05028644 1.0000000 3.032819
## 8 {high_income_uninsured_High 0.23711012 0.9880637 2.996618
## 9 {high_income_uninsured_High 0.08784214 0.9857143 2.989493
## 10 {high_income_uninsured_High 0.09898154 0.9841772 2.984831
## 11 {high_income_uninsured_High 0.09898154 0.9841772 2.984831
## 12 {high_income_uninsured_High 0.09452578 0.9801980 2.972763
## 13 {overall_uninsured_High 0.08561426 0.9711191 2.968148
## 14 {low_income_uninsured_Low 0.13749204 1.0000000 2.958569
## 15 {low_income_uninsured_Low 0.10661999 1.0000000 2.958569
## 16 {high_income_uninsured_Low 0.09388924 0.9932660 2.958144
## 17 {low_income_uninsured_Low 0.10948440 0.9971014 2.949993
## 18 {overall_uninsured_Low 0.08752387 0.9927798 2.948312
## 19 {high_income_uninsured_Low 0.24283896 0.9857881 2.935873
## 20 {high_income_uninsured_Low 0.10661999 0.9738372 2.900281
## 21 {high_income_uninsured_Low 0.10661999 0.9710145 2.891874
## 22 {low_income_uninsured_Low 0.09388924 0.9703947 2.870980
## 23 {high_income_uninsured_Med 0.17441120 0.9547038 2.854119
## 24 {overall_uninsured_High 0.08784214 0.9292929 2.840310
## 25 {overall_uninsured_High 0.08911521 0.9240924 2.824415
## 26 {high_income_uninsured_Med 0.09007002 0.9433333 2.820127
## 27 {overall_uninsured_High 0.09834500 0.9223881 2.819206
## 28 {high_income_uninsured_Low 0.10343730 0.9365994 2.789380
## 29 {low_income_uninsured_Med 0.09007002 0.9278689 2.781836
## 30 {low_income_uninsured_Low 0.10343730 0.9393064 2.779002
# outpot
write.csv(rules_top_table, "association_rules_top_metrics.csv", row.names = FALSE)
write.csv(as(rules_top, "data.frame"), "association_rules_top.csv", row.names = FALSE)
# visualization
plot(rules_top, method = "scatterplot", measure = c("support","confidence"), shading = "lift")
plot(rules_top, method = "matrix", measure = "lift")
## Itemsets in Antecedent (LHS)
## [1] "{high_income_uninsured_High,income_gap_High}"
## [2] "{high_income_uninsured_High,income_gap_High,overall_uninsured_High}"
## [3] "{high_income_uninsured_High,income_gap_Low,low_income_uninsured_High}"
## [4] "{income_gap_Low,low_income_uninsured_High}"
## [5] "{income_gap_Low,low_income_uninsured_High,overall_uninsured_High}"
## [6] "{income_gap_High,overall_uninsured_High}"
## [7] "{low_income_uninsured_High,overall_uninsured_High}"
## [8] "{income_gap_Med,low_income_uninsured_High,overall_uninsured_High}"
## [9] "{income_gap_High,low_income_uninsured_High,overall_uninsured_High}"
## [10] "{high_income_uninsured_High,income_gap_Low}"
## [11] "{high_income_uninsured_Low,income_gap_Low}"
## [12] "{high_income_uninsured_Low,income_gap_Low,overall_uninsured_Low}"
## [13] "{income_gap_Med,low_income_uninsured_Low,overall_uninsured_Low}"
## [14] "{high_income_uninsured_Low,income_gap_High}"
## [15] "{low_income_uninsured_Low,overall_uninsured_Low}"
## [16] "{income_gap_Low,overall_uninsured_Low}"
## [17] "{income_gap_Low,low_income_uninsured_Low,overall_uninsured_Low}"
## [18] "{income_gap_Med,low_income_uninsured_High}"
## [19] "{high_income_uninsured_Low,income_gap_Med,overall_uninsured_Low}"
## [20] "{low_income_uninsured_Med,overall_uninsured_Med}"
## [21] "{high_income_uninsured_High,income_gap_Med,low_income_uninsured_High}"
## [22] "{income_gap_Med,low_income_uninsured_Med,overall_uninsured_Med}"
## [23] "{high_income_uninsured_High,income_gap_Med}"
## [24] "{income_gap_Med,low_income_uninsured_Low}"
## [25] "{high_income_uninsured_Med,income_gap_Med,overall_uninsured_Med}"
## [26] "{high_income_uninsured_Low,income_gap_Med}"
## Itemsets in Consequent (RHS)
## [1] "{low_income_uninsured_Med}" "{high_income_uninsured_Med}"
## [3] "{high_income_uninsured_Low}" "{low_income_uninsured_Low}"
## [5] "{overall_uninsured_High}" "{overall_uninsured_Low}"
## [7] "{high_income_uninsured_High}" "{low_income_uninsured_High}"
plot(rules_top, method = "graph", engine = "htmlwidget")
The key rules suggest strong relationships such as counties with high poverty rates → high uninsured rates;low-income category + working-age population → higher uninsured probability、high median income → low uninsured rate.The association rules reinforce the clustering and PCA findings.
Socioeconomic disadvantage is strongly linked to lack of insurance coverage
Certain demographic groups systematically experience higher insurance gaps
Insurance inequality is structurally patterned rather than random
This confirms that health insurance disparities are driven by underlying socioeconomic structures.
Health insurance disparities in the United States are strongly shaped by race, age structure, and income differences. Minority groups—particularly Hispanic and Black populations—tend to experience higher uninsured rates, largely due to structural income inequality and employment instability. Age also plays a key role: children and the elderly typically have higher coverage rates due to public programs (Medicaid and Medicare), while working-age adults face the greatest risk of being uninsured. Among all factors, income is the most decisive driver, as higher-income individuals are more likely to access employer-sponsored or private insurance.
Based on these patterns, the population can be segmented into three profiles: (1) economically stable, insured individuals with higher income and stable employment; (2) transitional groups with moderate income and conditional insurance coverage depending on policy support; and (3) economically vulnerable individuals—often younger, lower-income, and from minority communities—who face a higher probability of being uninsured. These clusters reflect structural inequality rather than random variation in insurance participation.