1 Background


2 packages

install.packages("tidyverse")
install.packages(c("arules","arulesViz"))
library(tidyverse)
library(readr)
library(psych)
library(factoextra)
library(cluster)
library(arules)
library(arulesViz)

3 Read and clean data

The dataset used in this project is derived from the 2010 Small Area Health Insurance Estimates (SAHIE) program, published by the United States Census Bureau.

The dataset provides county-level and state-level estimates of:

df_raw <- read.csv("C:/Users/ASUS-PC/Desktop/UL_Project/sahie2010.csv",  
  skip = 3,
  header = TRUE,
  sep = ",",
  stringsAsFactors = FALSE
)

df_raw$stcou <- gsub('^="|"$', "", df_raw$stcou)
df_raw$name  <- trimws(df_raw$name)

dim(df_raw)
## [1] 216790     18
names(df_raw)
##  [1] "year"      "stcou"     "geocat"    "agecat"    "racecat"   "sexcat"   
##  [7] "iprcat"    "name"      "nipr"      "nipr_moe"  "nui"       "nui_moe"  
## [13] "nic"       "nic_moe"   "pctui"     "pctui_moe" "pctic"     "pctic_moe"
str(df_raw)
## 'data.frame':    216790 obs. of  18 variables:
##  $ year     : int  2010 2010 2010 2010 2010 2010 2010 2010 2010 2010 ...
##  $ stcou    : chr  "=01000" "=02000" "=04000" "=05000" ...
##  $ geocat   : int  40 40 40 40 40 40 40 40 40 40 ...
##  $ agecat   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ racecat  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ sexcat   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ iprcat   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ name     : chr  "Alabama" "Alaska" "Arizona" "Arkansas" ...
##  $ nipr     : chr  "4021188" "647241" "5399995" "2431331" ...
##  $ nipr_moe : chr  "N/A" "N/A" "N/A" "N/A" ...
##  $ nui      : chr  "681437" "138777" "1042809" "500134" ...
##  $ nui_moe  : chr  "14309" "5722" "21121" "11712" ...
##  $ nic      : chr  "3339750" "508464" "4357186" "1931198" ...
##  $ nic_moe  : chr  "14309" "5722" "21121" "11712" ...
##  $ pctui    : chr  "16.9" "21.4" "19.3" "20.6" ...
##  $ pctui_moe: chr  "0.4" "0.9" "0.4" "0.5" ...
##  $ pctic    : chr  "83.1" "78.6" "80.7" "79.4" ...
##  $ pctic_moe: chr  "0.4" "0.9" "0.4" "0.5" ...
summary(df_raw)
##       year         stcou               geocat          agecat     
##  Min.   :2010   Length:216790      Min.   :40.00   Min.   :0.000  
##  1st Qu.:2010   Class :character   1st Qu.:50.00   1st Qu.:1.000  
##  Median :2010   Mode  :character   Median :50.00   Median :2.000  
##  Mean   :2010                      Mean   :49.42   Mean   :1.684  
##  3rd Qu.:2010                      3rd Qu.:50.00   3rd Qu.:3.000  
##  Max.   :2010                      Max.   :50.00   Max.   :4.000  
##     racecat            sexcat           iprcat      name          
##  Min.   :0.00000   Min.   :0.0000   Min.   :0   Length:216790     
##  1st Qu.:0.00000   1st Qu.:0.0000   1st Qu.:1   Class :character  
##  Median :0.00000   Median :1.0000   Median :2   Mode  :character  
##  Mean   :0.08469   Mean   :0.9263   Mean   :2                     
##  3rd Qu.:0.00000   3rd Qu.:2.0000   3rd Qu.:3                     
##  Max.   :3.00000   Max.   :2.0000   Max.   :4                     
##      nipr             nipr_moe             nui              nui_moe         
##  Length:216790      Length:216790      Length:216790      Length:216790     
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##      nic              nic_moe             pctui            pctui_moe        
##  Length:216790      Length:216790      Length:216790      Length:216790     
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##     pctic            pctic_moe        
##  Length:216790      Length:216790     
##  Class :character   Class :character  
##  Mode  :character   Mode  :character  
##                                       
##                                       
## 

4 Data Cleaning

# 删除包含 "moe" 的列
moe_cols <- grep("moe", names(df_raw), ignore.case = TRUE)
df_clean <- df_raw[, -moe_cols]

# 删除指定列(如果存在)
drop_cols <- c("nipr", "nic", "nui", "year")
drop_exist <- drop_cols[drop_cols %in% names(df_clean)]
df_clean <- df_clean[, !(names(df_clean) %in% drop_exist)]

# 检查分类变量
table(df_clean$geocat)
## 
##     40     50 
##  12495 204295
table(df_clean$iprcat)
## 
##     0     1     2     3     4 
## 43358 43358 43358 43358 43358
table(df_clean$racecat)
## 
##      0      1      2      3 
## 207610   3060   3060   3060
table(df_clean$agecat)
## 
##     0     1     2     3     4 
## 50205 50205 50205 50205 15970
table(df_clean$sexcat)
## 
##     0     1     2 
## 82910 66940 66940

5 Split data

df_state_raw  <- subset(df_clean, geocat == 40)
df_county_raw <- subset(df_clean, geocat == 50)

6 State-Level feature construction

statement:
- overall:All classifications are fixed at 0.

state_overall <- subset(
  df_state_raw,
  agecat == 0 & racecat == 0 & sexcat == 0 & iprcat == 0,
  select = c("name", "pctui")
)
names(state_overall)[2] <- "overall_uninsured"

state_income_raw <- subset(
  df_state_raw,
  agecat == 0 & racecat == 0 & sexcat == 0 & iprcat %in% c(1,4),
  select = c("name", "iprcat", "pctui")
)

state_income_wide <- reshape(
  state_income_raw,
  idvar = "name",
  timevar = "iprcat",
  direction = "wide"
)

# 重命名列
names(state_income_wide) <- gsub("pctui.1", "low_income_uninsured", names(state_income_wide))
names(state_income_wide) <- gsub("pctui.4", "high_income_uninsured", names(state_income_wide))

# 4.3 race:固定 age/sex=0 & iprcat=0,只让 racecat 变化(white=1, black=2)
state_race_raw <- subset(
  df_state_raw,
  agecat == 0 & sexcat == 0 & iprcat == 0 & racecat %in% c(1,2),
  select = c("name", "racecat", "pctui")
)

state_race_wide <- reshape(
  state_race_raw,
  idvar = "name",
  timevar = "racecat",
  direction = "wide"
)

names(state_race_wide) <- gsub("pctui.1", "white_uninsured", names(state_race_wide))
names(state_race_wide) <- gsub("pctui.2", "black_uninsured", names(state_race_wide))


df_state <- merge(state_overall, state_income_wide, by = "name", all.x = TRUE)
df_state <- merge(df_state, state_race_wide, by = "name", all.x = TRUE)


clean_percent <- function(x){
  as.numeric(gsub("%", "", gsub(",", "", x)))
}

df_state$overall_uninsured     <- clean_percent(df_state$overall_uninsured)
df_state$low_income_uninsured  <- clean_percent(df_state$low_income_uninsured)
df_state$high_income_uninsured <- clean_percent(df_state$high_income_uninsured)
df_state$white_uninsured       <- clean_percent(df_state$white_uninsured)
df_state$black_uninsured       <- clean_percent(df_state$black_uninsured)


df_state$income_gap <- df_state$low_income_uninsured - df_state$high_income_uninsured
df_state$race_gap   <- df_state$black_uninsured - df_state$white_uninsured


df_state <- na.omit(df_state)


names(df_state)[1] <- "state"

7 County-Level feature construction

county_overall <- subset(
  df_county_raw,
  agecat == 0 & racecat == 0 & sexcat == 0 & iprcat == 0,
  select = c("stcou", "name", "pctui")
)
names(county_overall)[3] <- "overall_uninsured"

county_income_raw <- subset(
  df_county_raw,
  agecat == 0 & racecat == 0 & sexcat == 0 & iprcat %in% c(1,4),
  select = c("stcou", "name", "iprcat", "pctui")
)

county_income_wide <- reshape(
  county_income_raw,
  idvar = c("stcou", "name"),
  timevar = "iprcat",
  direction = "wide"
)

names(county_income_wide) <- gsub("pctui.1", "low_income_uninsured", names(county_income_wide))
names(county_income_wide) <- gsub("pctui.4", "high_income_uninsured", names(county_income_wide))

# 合并
df_county <- merge(county_overall, county_income_wide, 
                   by = c("stcou", "name"), all.x = TRUE)

# 数值转换
df_county$overall_uninsured     <- clean_percent(df_county$overall_uninsured)
df_county$low_income_uninsured  <- clean_percent(df_county$low_income_uninsured)
df_county$high_income_uninsured <- clean_percent(df_county$high_income_uninsured)

df_county$income_gap <- df_county$low_income_uninsured - df_county$high_income_uninsured
df_county <- na.omit(df_county)

names(df_county)[2] <- "county"

8 Standardization

# State
X_state <- df_state[, !(names(df_state) %in% c("state"))]
X_state_scaled <- scale(X_state)

# County
X_county <- df_county[, !(names(df_county) %in% c("stcou", "county"))]
X_county_scaled <- scale(X_county)

# 检查维度
dim(df_state)
## [1] 51  8
dim(X_state_scaled)
## [1] 51  7
dim(df_county)
## [1] 3142    6

9 PCA

# ---- (A) Interpretation PCA (REPORT) ----
pca.f <- principal(X_county_scaled, nfactors = 4, rotate = "varimax", scores = TRUE)
pca.f
## Principal Components Analysis
## Call: principal(r = X_county_scaled, nfactors = 4, rotate = "varimax", 
##     scores = TRUE)
## Standardized loadings (pattern matrix) based upon correlation matrix
##                        RC1   RC2   RC3 RC4 h2       u2 com
## overall_uninsured     0.98 -0.15 -0.14   0  1 -4.4e-16 1.1
## low_income_uninsured  0.89  0.45  0.09   0  1  6.7e-16 1.5
## high_income_uninsured 0.99  0.10  0.10   0  1  3.3e-16 1.0
## income_gap            0.05  1.00  0.02   0  1 -2.0e-15 1.0
## 
##                        RC1  RC2  RC3 RC4
## SS loadings           2.73 1.23 0.04   0
## Proportion Var        0.68 0.31 0.01   0
## Cumulative Var        0.68 0.99 1.00   1
## Proportion Explained  0.68 0.31 0.01   0
## Cumulative Proportion 0.68 0.99 1.00   1
## 
## Mean item complexity =  1.2
## Test of the hypothesis that 4 components are sufficient.
## 
## The root mean square of the residuals (RMSR) is  0 
##  with the empirical chi square  0  with prob <  NA 
## 
## Fit based upon off diagonal values = 1
print(loadings(pca.f), digits = 3, cutoff = 0.4, sort = TRUE)
## 
## Loadings:
##                       RC1    RC2    RC3    RC4   
## overall_uninsured      0.980                     
## low_income_uninsured   0.888  0.450              
## high_income_uninsured  0.990                     
## income_gap                    0.999              
## 
##                  RC1   RC2   RC3 RC4
## SS loadings    2.733 1.231 0.036   0
## Proportion Var 0.683 0.308 0.009   0
## Cumulative Var 0.683 0.991 1.000   1
# rotated loadings & PC scores
pc_loadings_rot <- as.data.frame(unclass(pca.f$loadings))
pc_scores <- as.data.frame(pca.f$scores)  

# 
pca.vis <- prcomp(X_county_scaled, center = FALSE, scale. = FALSE)

# Scree plot(eigenvalue/variance)
fviz_eig(pca.vis, addlabels = TRUE)

# Loading / Variable plot
fviz_pca_var(pca.vis, repel = TRUE)

# Individuals plot
fviz_pca_ind(pca.vis, geom = "point", repel = TRUE)

# Biplot
fviz_pca_biplot(pca.vis, repel = TRUE)

>Interpretation


10 MDS

mds_n  <- min(2000, nrow(pc_scores))     
mds_id <- sample(seq_len(nrow(pc_scores)), mds_n)

D_mds  <- dist(pc_scores[mds_id, , drop = FALSE])
mds_xy <- cmdscale(D_mds, k = 2)

mds_df <- data.frame(
  Dim1 = mds_xy[, 1],
  Dim2 = mds_xy[, 2]
)

dim(X_county_scaled)
## [1] 3142    4

11 Dimensionality Reduction Interpretation

Dimension 1 – Socioeconomic Advantage

Dimension 2 – Insurance Structure & Vulnerability

MDS Interpretation


12 Clustering

# 确保是 matrix
X_county_scaled <- as.matrix(X_county_scaled)

set.seed(123)

fviz_nbclust(X_county_scaled, FUNcluster = pam, method = "silhouette", k.max = 10) +
  theme_minimal() +
  labs(title = "Choosing k (County, PAM) using silhouette")

best_k <- 2
pam_res <- pam(X_county_scaled, k = best_k)

county_pam <- eclust(
  X_county_scaled,
  FUNcluster = "pam",
  k = best_k,
  graph = FALSE
)

# label
county_cluster <- county_pam$cluster
table(county_cluster)
## county_cluster
##    1    2 
## 1670 1472
# Cluster profiling:Original standardized variable
prof_raw <- describeBy(as.data.frame(X_county_scaled), group = county_cluster, mat = TRUE)
prof_raw
##                        item group1 vars    n       mean        sd      median
## overall_uninsured1        1      1    1 1670 -0.6899227 0.5784077 -0.64919353
## overall_uninsured2        2      2    1 1472  0.7827249 0.7760368  0.67223388
## low_income_uninsured1     3      1    2 1670 -0.7185475 0.5608215 -0.65711770
## low_income_uninsured2     4      2    2 1472  0.8152000 0.7261304  0.64258340
## high_income_uninsured1    5      1    3 1670 -0.7404940 0.5056826 -0.68159838
## high_income_uninsured2    6      2    3 1472  0.8400985 0.7186037  0.66137049
## income_gap1               7      1    4 1670 -0.1849066 0.9276160 -0.28057189
## income_gap2               8      2    4 1472  0.2097785 1.0373700  0.08594596
##                           trimmed       mad        min       max    range
## overall_uninsured1     -0.6800960 0.6618744 -2.6670489 0.8686623 3.535711
## overall_uninsured2      0.7276243 0.6353994 -1.8456210 4.0829452 5.928566
## low_income_uninsured1  -0.6995353 0.5693223 -3.0940573 0.9084314 4.002489
## low_income_uninsured2   0.7423955 0.6569103 -0.6423484 4.5712254 5.213574
## high_income_uninsured1 -0.7177108 0.5475486 -2.7632001 0.3088412 3.072041
## high_income_uninsured2  0.7531866 0.6471028 -0.4465788 4.9420838 5.388663
## income_gap1            -0.2284473 0.7849102 -3.1312662 5.0950232 8.226289
## income_gap2             0.1440246 0.9056656 -3.6606809 5.4208168 9.081498
##                              skew   kurtosis         se
## overall_uninsured1     -0.1993882 -0.3857059 0.01415389
## overall_uninsured2      0.7520742  1.5407543 0.02022686
## low_income_uninsured1  -0.4306136  0.5430498 0.01372355
## low_income_uninsured2   0.9360990  0.6887952 0.01892608
## high_income_uninsured1 -0.4827445 -0.1417078 0.01237428
## high_income_uninsured2  1.1617190  1.4654703 0.01872990
## income_gap1             0.5793741  1.6585375 0.02269917
## income_gap2             0.7217728  1.8883646 0.02703833
# Cluster profiling:PC
prof_pc <- describeBy(pc_scores, group = county_cluster, mat = TRUE)
prof_pc
##      item group1 vars    n          mean           sd        median
## RC11    1      1    1 1670 -2.056500e+00 1.420925e+00 -1.833936e+00
## RC12    2      2    1 1472  2.333121e+00 1.937890e+00  1.809761e+00
## RC21    3      1    2 1670 -4.816668e-01 1.121789e+00 -5.709203e-01
## RC22    4      2    2 1472  5.464562e-01 1.268645e+00  4.014477e-01
## RC31    5      1    3 1670 -4.406415e-02 8.091444e-02 -5.397956e-02
## RC32    6      2    3 1472  4.999126e-02 1.052532e-01  3.269777e-02
## RC41    7      1    4 1670 -8.611639e-09 8.613196e-09 -8.628185e-09
## RC42    8      2    4 1472  9.769999e-09 1.047807e-08  8.043680e-09
##            trimmed          mad           min          max        range
## RC11 -1.971552e+00 1.592092e+00 -7.992411e+00 3.570672e-01 8.349478e+00
## RC12  2.087008e+00 1.722893e+00 -6.517869e-01 1.295062e+01 1.360241e+01
## RC21 -5.297753e-01 1.001889e+00 -4.138088e+00 5.519699e+00 9.657787e+00
## RC22  4.581844e-01 1.107048e+00 -4.110167e+00 6.872363e+00 1.098253e+01
## RC31 -4.929714e-02 7.083578e-02 -2.950622e-01 3.431489e-01 6.382111e-01
## RC32  3.986948e-02 9.078509e-02 -2.819518e-01 5.431796e-01 8.251314e-01
## RC41 -8.638285e-09 7.980299e-09 -4.058805e-08 2.397939e-08 6.456745e-08
## RC42  8.931764e-09 9.776165e-09 -2.029838e-08 5.362453e-08 7.392291e-08
##             skew  kurtosis           se
## RC11 -0.61941688 0.1145083 3.477065e-02
## RC12  1.18247759 1.4094002 5.050975e-02
## RC21  0.53955870 1.5180780 2.745066e-02
## RC22  0.76161454 1.6211093 3.306635e-02
## RC31  0.82419500 2.0143917 1.980012e-03
## RC32  1.08307668 1.9426601 2.743350e-03
## RC41  0.02163591 0.8354717 2.107690e-10
## RC42  0.78125839 0.6310689 2.731040e-10

13 Cluster Analysis Interpretation

Based on the model results, the counties can be grouped into several clusters with significantly different socio-economic and insurance characteristics. Cluster 1 – High Insurance Coverage & Higher Income Areas

Cluster 2 – Moderate Coverage & Mixed Socioeconomic Structure

Cluster 2 – High Uninsured & Low-Income Areas


14 Silhouette interpretation

-I visualized the clustering results using MDS and PCA to better understand the separation between groups. I also generated a silhouette plot to evaluate cluster quality and assess how well each county fits within its assigned cluster. These visualizations help me interpret the clustering structure and validate the choice of k.

mds_cluster <- as.factor(county_cluster[mds_id])

fviz_cluster(
  list(data = mds_df, cluster = mds_cluster),
  geom = "point",
  ellipse.type = "norm",
  show.clust.cent = TRUE
) +
  theme_minimal() +
  labs(title = "MDS (sampled) colored by County clusters")

# Silhouette 图
fviz_silhouette(county_pam) +
  theme_minimal() +
  labs(title = "Silhouette plot (County, PAM)")
##   cluster size ave.sil.width
## 1       1 1670          0.45
## 2       2 1472          0.32

# 聚类在 PCA 空间可视化
fviz_cluster(
  county_pam,
  data = X_county_scaled,
  geom = "point",
  ellipse.type = "norm"
) +
  theme_minimal() +
  labs(title = "County clusters (PAM) visualized in PCA space")


15 K-means (contrast)

county_km <- eclust(
  X_county_scaled,
  FUNcluster = "kmeans",
  k = best_k,
  graph = FALSE
)

fviz_cluster(county_km, data = X_county_scaled, geom = "point") +
  theme_minimal() +
  labs(title = "County clusters (kmeans) in PCA space")

fviz_silhouette(county_km) +
  theme_minimal() +
  labs(title = "Silhouette plot (County, kmeans)")
##   cluster size ave.sil.width
## 1       1 1307          0.33
## 2       2 1835          0.44


16 Comparing K-means & PAM


17 Association Rules / Apriori

set.seed(123)

# eigenvalue
X_rule <- as.data.frame(X_county_scaled)

# 9.2 Numerical discretization
disc3 <- function(x){
  q <- quantile(x, probs = c(0, 1/3, 2/3, 1), na.rm = TRUE, type = 7)
  cut(x, breaks = q, include.lowest = TRUE, labels = c("Low","Med","High"))
}

X_cat <- as.data.frame(lapply(X_rule, disc3))

# Combine the variable names and categories to form "item"
for(j in seq_along(X_cat)){
  nm <- names(X_cat)[j]
  X_cat[[j]] <- paste(nm, X_cat[[j]], sep = "_")
}

# transactions
basket_list <- lapply(seq_len(nrow(X_cat)), function(i){
  unlist(X_cat[i, , drop = FALSE], use.names = FALSE)
})
trans <- as(basket_list, "transactions")

summary(trans)
## transactions as itemMatrix in sparse format with
##  3142 rows (elements/itemsets/transactions) and
##  12 columns (items) and a density of 0.3333333 
## 
## most frequent items:
##  low_income_uninsured_Low     overall_uninsured_Low     overall_uninsured_Med 
##                      1062                      1058                      1056 
## high_income_uninsured_Low            income_gap_Med                   (Other) 
##                      1055                      1055                      7282 
## 
## element (itemset/transaction) length distribution:
## sizes
##    4 
## 3142 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       4       4       4       4       4       4 
## 
## includes extended item information - examples:
##                       labels
## 1 high_income_uninsured_High
## 2  high_income_uninsured_Low
## 3  high_income_uninsured_Med

-I applied the Apriori algorithm to the transaction dataset in order to extract association rules from the county-level features. I set a minimum support threshold of 0.05, meaning that a rule must appear in at least 5% of the observations to be considered. I also required a minimum confidence of 0.60, ensuring that the rule has at least a 60% conditional probability of occurring.

# 9.3 Apriori
rules_all <- apriori(
  trans,
  parameter = list(
    supp   = 0.05,
    conf   = 0.60,
    minlen = 2,
    maxlen = 5
  )
)
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.6    0.1    1 none FALSE            TRUE       5    0.05      2
##  maxlen target  ext
##       5  rules TRUE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 157 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[12 item(s), 3142 transaction(s)] done [0.00s].
## sorting and recoding items ... [12 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 done [0.00s].
## writing ... [86 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].

rules_all <- sort(rules_all, by = "lift", decreasing = TRUE)
inspect(head(rules_all, 20))
##      lhs                              rhs                             support confidence   coverage     lift count
## [1]  {income_gap_High,                                                                                            
##       overall_uninsured_High}      => {low_income_uninsured_High}  0.10057288  1.0000000 0.10057288 3.044574   316
## [2]  {high_income_uninsured_High,                                                                                 
##       income_gap_High}             => {low_income_uninsured_High}  0.13494589  1.0000000 0.13494589 3.044574   424
## [3]  {high_income_uninsured_High,                                                                                 
##       income_gap_High,                                                                                            
##       overall_uninsured_High}      => {low_income_uninsured_High}  0.09898154  1.0000000 0.09898154 3.044574   311
## [4]  {income_gap_Low,                                                                                             
##       low_income_uninsured_High}   => {overall_uninsured_High}     0.05028644  0.9937107 0.05060471 3.037197   158
## [5]  {high_income_uninsured_High,                                                                                 
##       income_gap_Low,                                                                                             
##       low_income_uninsured_High}   => {overall_uninsured_High}     0.05028644  0.9937107 0.05060471 3.037197   158
## [6]  {income_gap_Low,                                                                                             
##       low_income_uninsured_High}   => {high_income_uninsured_High} 0.05060471  1.0000000 0.05060471 3.032819   159
## [7]  {income_gap_Low,                                                                                             
##       low_income_uninsured_High,                                                                                  
##       overall_uninsured_High}      => {high_income_uninsured_High} 0.05028644  1.0000000 0.05028644 3.032819   158
## [8]  {low_income_uninsured_High,                                                                                  
##       overall_uninsured_High}      => {high_income_uninsured_High} 0.23711012  0.9880637 0.23997454 2.996618   745
## [9]  {income_gap_Med,                                                                                             
##       low_income_uninsured_High,                                                                                  
##       overall_uninsured_High}      => {high_income_uninsured_High} 0.08784214  0.9857143 0.08911521 2.989493   276
## [10] {income_gap_High,                                                                                            
##       overall_uninsured_High}      => {high_income_uninsured_High} 0.09898154  0.9841772 0.10057288 2.984831   311
## [11] {income_gap_High,                                                                                            
##       low_income_uninsured_High,                                                                                  
##       overall_uninsured_High}      => {high_income_uninsured_High} 0.09898154  0.9841772 0.10057288 2.984831   311
## [12] {income_gap_Med,                                                                                             
##       low_income_uninsured_High}   => {high_income_uninsured_High} 0.09452578  0.9801980 0.09643539 2.972763   297
## [13] {high_income_uninsured_High,                                                                                 
##       income_gap_Low}              => {overall_uninsured_High}     0.08561426  0.9711191 0.08816041 2.968148   269
## [14] {high_income_uninsured_Low,                                                                                  
##       income_gap_Low}              => {low_income_uninsured_Low}   0.13749204  1.0000000 0.13749204 2.958569   432
## [15] {high_income_uninsured_Low,                                                                                  
##       income_gap_Low,                                                                                             
##       overall_uninsured_Low}       => {low_income_uninsured_Low}   0.10661999  1.0000000 0.10661999 2.958569   335
## [16] {income_gap_Med,                                                                                             
##       low_income_uninsured_Low,                                                                                   
##       overall_uninsured_Low}       => {high_income_uninsured_Low}  0.09388924  0.9932660 0.09452578 2.958144   295
## [17] {income_gap_Low,                                                                                             
##       overall_uninsured_Low}       => {low_income_uninsured_Low}   0.10948440  0.9971014 0.10980267 2.949993   344
## [18] {high_income_uninsured_Low,                                                                                  
##       income_gap_High}             => {overall_uninsured_Low}      0.08752387  0.9927798 0.08816041 2.948312   275
## [19] {low_income_uninsured_Low,                                                                                   
##       overall_uninsured_Low}       => {high_income_uninsured_Low}  0.24283896  0.9857881 0.24633991 2.935873   763
## [20] {income_gap_Low,                                                                                             
##       low_income_uninsured_Low,                                                                                   
##       overall_uninsured_Low}       => {high_income_uninsured_Low}  0.10661999  0.9738372 0.10948440 2.900281   335
# filtering
rules_strong <- subset(rules_all, subset = lift > 1.2)
inspect(head(rules_strong, 20))
##      lhs                              rhs                             support confidence   coverage     lift count
## [1]  {income_gap_High,                                                                                            
##       overall_uninsured_High}      => {low_income_uninsured_High}  0.10057288  1.0000000 0.10057288 3.044574   316
## [2]  {high_income_uninsured_High,                                                                                 
##       income_gap_High}             => {low_income_uninsured_High}  0.13494589  1.0000000 0.13494589 3.044574   424
## [3]  {high_income_uninsured_High,                                                                                 
##       income_gap_High,                                                                                            
##       overall_uninsured_High}      => {low_income_uninsured_High}  0.09898154  1.0000000 0.09898154 3.044574   311
## [4]  {income_gap_Low,                                                                                             
##       low_income_uninsured_High}   => {overall_uninsured_High}     0.05028644  0.9937107 0.05060471 3.037197   158
## [5]  {high_income_uninsured_High,                                                                                 
##       income_gap_Low,                                                                                             
##       low_income_uninsured_High}   => {overall_uninsured_High}     0.05028644  0.9937107 0.05060471 3.037197   158
## [6]  {income_gap_Low,                                                                                             
##       low_income_uninsured_High}   => {high_income_uninsured_High} 0.05060471  1.0000000 0.05060471 3.032819   159
## [7]  {income_gap_Low,                                                                                             
##       low_income_uninsured_High,                                                                                  
##       overall_uninsured_High}      => {high_income_uninsured_High} 0.05028644  1.0000000 0.05028644 3.032819   158
## [8]  {low_income_uninsured_High,                                                                                  
##       overall_uninsured_High}      => {high_income_uninsured_High} 0.23711012  0.9880637 0.23997454 2.996618   745
## [9]  {income_gap_Med,                                                                                             
##       low_income_uninsured_High,                                                                                  
##       overall_uninsured_High}      => {high_income_uninsured_High} 0.08784214  0.9857143 0.08911521 2.989493   276
## [10] {income_gap_High,                                                                                            
##       overall_uninsured_High}      => {high_income_uninsured_High} 0.09898154  0.9841772 0.10057288 2.984831   311
## [11] {income_gap_High,                                                                                            
##       low_income_uninsured_High,                                                                                  
##       overall_uninsured_High}      => {high_income_uninsured_High} 0.09898154  0.9841772 0.10057288 2.984831   311
## [12] {income_gap_Med,                                                                                             
##       low_income_uninsured_High}   => {high_income_uninsured_High} 0.09452578  0.9801980 0.09643539 2.972763   297
## [13] {high_income_uninsured_High,                                                                                 
##       income_gap_Low}              => {overall_uninsured_High}     0.08561426  0.9711191 0.08816041 2.968148   269
## [14] {high_income_uninsured_Low,                                                                                  
##       income_gap_Low}              => {low_income_uninsured_Low}   0.13749204  1.0000000 0.13749204 2.958569   432
## [15] {high_income_uninsured_Low,                                                                                  
##       income_gap_Low,                                                                                             
##       overall_uninsured_Low}       => {low_income_uninsured_Low}   0.10661999  1.0000000 0.10661999 2.958569   335
## [16] {income_gap_Med,                                                                                             
##       low_income_uninsured_Low,                                                                                   
##       overall_uninsured_Low}       => {high_income_uninsured_Low}  0.09388924  0.9932660 0.09452578 2.958144   295
## [17] {income_gap_Low,                                                                                             
##       overall_uninsured_Low}       => {low_income_uninsured_Low}   0.10948440  0.9971014 0.10980267 2.949993   344
## [18] {high_income_uninsured_Low,                                                                                  
##       income_gap_High}             => {overall_uninsured_Low}      0.08752387  0.9927798 0.08816041 2.948312   275
## [19] {low_income_uninsured_Low,                                                                                   
##       overall_uninsured_Low}       => {high_income_uninsured_Low}  0.24283896  0.9857881 0.24633991 2.935873   763
## [20] {income_gap_Low,                                                                                             
##       low_income_uninsured_Low,                                                                                   
##       overall_uninsured_Low}       => {high_income_uninsured_Low}  0.10661999  0.9738372 0.10948440 2.900281   335
# visualization
topN <- min(30, length(rules_strong))
rules_top <- head(rules_strong, topN)

inspect(rules_top)
##      lhs                              rhs                             support confidence   coverage     lift count
## [1]  {income_gap_High,                                                                                            
##       overall_uninsured_High}      => {low_income_uninsured_High}  0.10057288  1.0000000 0.10057288 3.044574   316
## [2]  {high_income_uninsured_High,                                                                                 
##       income_gap_High}             => {low_income_uninsured_High}  0.13494589  1.0000000 0.13494589 3.044574   424
## [3]  {high_income_uninsured_High,                                                                                 
##       income_gap_High,                                                                                            
##       overall_uninsured_High}      => {low_income_uninsured_High}  0.09898154  1.0000000 0.09898154 3.044574   311
## [4]  {income_gap_Low,                                                                                             
##       low_income_uninsured_High}   => {overall_uninsured_High}     0.05028644  0.9937107 0.05060471 3.037197   158
## [5]  {high_income_uninsured_High,                                                                                 
##       income_gap_Low,                                                                                             
##       low_income_uninsured_High}   => {overall_uninsured_High}     0.05028644  0.9937107 0.05060471 3.037197   158
## [6]  {income_gap_Low,                                                                                             
##       low_income_uninsured_High}   => {high_income_uninsured_High} 0.05060471  1.0000000 0.05060471 3.032819   159
## [7]  {income_gap_Low,                                                                                             
##       low_income_uninsured_High,                                                                                  
##       overall_uninsured_High}      => {high_income_uninsured_High} 0.05028644  1.0000000 0.05028644 3.032819   158
## [8]  {low_income_uninsured_High,                                                                                  
##       overall_uninsured_High}      => {high_income_uninsured_High} 0.23711012  0.9880637 0.23997454 2.996618   745
## [9]  {income_gap_Med,                                                                                             
##       low_income_uninsured_High,                                                                                  
##       overall_uninsured_High}      => {high_income_uninsured_High} 0.08784214  0.9857143 0.08911521 2.989493   276
## [10] {income_gap_High,                                                                                            
##       overall_uninsured_High}      => {high_income_uninsured_High} 0.09898154  0.9841772 0.10057288 2.984831   311
## [11] {income_gap_High,                                                                                            
##       low_income_uninsured_High,                                                                                  
##       overall_uninsured_High}      => {high_income_uninsured_High} 0.09898154  0.9841772 0.10057288 2.984831   311
## [12] {income_gap_Med,                                                                                             
##       low_income_uninsured_High}   => {high_income_uninsured_High} 0.09452578  0.9801980 0.09643539 2.972763   297
## [13] {high_income_uninsured_High,                                                                                 
##       income_gap_Low}              => {overall_uninsured_High}     0.08561426  0.9711191 0.08816041 2.968148   269
## [14] {high_income_uninsured_Low,                                                                                  
##       income_gap_Low}              => {low_income_uninsured_Low}   0.13749204  1.0000000 0.13749204 2.958569   432
## [15] {high_income_uninsured_Low,                                                                                  
##       income_gap_Low,                                                                                             
##       overall_uninsured_Low}       => {low_income_uninsured_Low}   0.10661999  1.0000000 0.10661999 2.958569   335
## [16] {income_gap_Med,                                                                                             
##       low_income_uninsured_Low,                                                                                   
##       overall_uninsured_Low}       => {high_income_uninsured_Low}  0.09388924  0.9932660 0.09452578 2.958144   295
## [17] {income_gap_Low,                                                                                             
##       overall_uninsured_Low}       => {low_income_uninsured_Low}   0.10948440  0.9971014 0.10980267 2.949993   344
## [18] {high_income_uninsured_Low,                                                                                  
##       income_gap_High}             => {overall_uninsured_Low}      0.08752387  0.9927798 0.08816041 2.948312   275
## [19] {low_income_uninsured_Low,                                                                                   
##       overall_uninsured_Low}       => {high_income_uninsured_Low}  0.24283896  0.9857881 0.24633991 2.935873   763
## [20] {income_gap_Low,                                                                                             
##       low_income_uninsured_Low,                                                                                   
##       overall_uninsured_Low}       => {high_income_uninsured_Low}  0.10661999  0.9738372 0.10948440 2.900281   335
## [21] {income_gap_Low,                                                                                             
##       overall_uninsured_Low}       => {high_income_uninsured_Low}  0.10661999  0.9710145 0.10980267 2.891874   335
## [22] {high_income_uninsured_Low,                                                                                  
##       income_gap_Med,                                                                                             
##       overall_uninsured_Low}       => {low_income_uninsured_Low}   0.09388924  0.9703947 0.09675366 2.870980   295
## [23] {low_income_uninsured_Med,                                                                                   
##       overall_uninsured_Med}       => {high_income_uninsured_Med}  0.17441120  0.9547038 0.18268619 2.854119   548
## [24] {high_income_uninsured_High,                                                                                 
##       income_gap_Med,                                                                                             
##       low_income_uninsured_High}   => {overall_uninsured_High}     0.08784214  0.9292929 0.09452578 2.840310   276
## [25] {income_gap_Med,                                                                                             
##       low_income_uninsured_High}   => {overall_uninsured_High}     0.08911521  0.9240924 0.09643539 2.824415   280
## [26] {income_gap_Med,                                                                                             
##       low_income_uninsured_Med,                                                                                   
##       overall_uninsured_Med}       => {high_income_uninsured_Med}  0.09007002  0.9433333 0.09548059 2.820127   283
## [27] {high_income_uninsured_High,                                                                                 
##       income_gap_Med}              => {overall_uninsured_High}     0.09834500  0.9223881 0.10661999 2.819206   309
## [28] {income_gap_Med,                                                                                             
##       low_income_uninsured_Low}    => {high_income_uninsured_Low}  0.10343730  0.9365994 0.11043921 2.789380   325
## [29] {high_income_uninsured_Med,                                                                                  
##       income_gap_Med,                                                                                             
##       overall_uninsured_Med}       => {low_income_uninsured_Med}   0.09007002  0.9278689 0.09707193 2.781836   283
## [30] {high_income_uninsured_Low,                                                                                  
##       income_gap_Med}              => {low_income_uninsured_Low}   0.10343730  0.9393064 0.11012094 2.779002   325

-I converted the top association rules into a data frame format to facilitate interpretation and reporting. Since the rules are originally stored as combined strings in the form {LHS} => {RHS}, I split each rule into its left-hand side (LHS) and right-hand side (RHS) components and removed the curly brackets for clarity.I then constructed a clean summary table containing the LHS, RHS, support, confidence, and lift values for each rule. Finally, I sorted the table in descending order of lift to prioritize the strongest and most meaningful associations, and printed the top results for interpretation.

rules_top_df <- as(rules_top, "data.frame")
lhs_rhs <- strsplit(as.character(rules_top_df$rules), "=>", fixed = TRUE)
lhs <- trimws(gsub("^\\{|\\}$", "", sapply(lhs_rhs, `[`, 1)))
rhs <- trimws(gsub("^\\{|\\}$", "", sapply(lhs_rhs, `[`, 2)))

rules_top_table <- data.frame(
  LHS = lhs,
  RHS = rhs,
  support = rules_top_df$support,
  confidence = rules_top_df$confidence,
  lift = rules_top_df$lift,
  stringsAsFactors = FALSE
)

rules_top_table <- rules_top_table[order(-rules_top_table$lift), ]
print(head(rules_top_table, 30))
##                                                                     LHS
## 1                               income_gap_High,overall_uninsured_High}
## 2                           high_income_uninsured_High,income_gap_High}
## 3    high_income_uninsured_High,income_gap_High,overall_uninsured_High}
## 4                             income_gap_Low,low_income_uninsured_High}
## 5  high_income_uninsured_High,income_gap_Low,low_income_uninsured_High}
## 6                             income_gap_Low,low_income_uninsured_High}
## 7      income_gap_Low,low_income_uninsured_High,overall_uninsured_High}
## 8                     low_income_uninsured_High,overall_uninsured_High}
## 9      income_gap_Med,low_income_uninsured_High,overall_uninsured_High}
## 10                              income_gap_High,overall_uninsured_High}
## 11    income_gap_High,low_income_uninsured_High,overall_uninsured_High}
## 12                            income_gap_Med,low_income_uninsured_High}
## 13                           high_income_uninsured_High,income_gap_Low}
## 14                            high_income_uninsured_Low,income_gap_Low}
## 15      high_income_uninsured_Low,income_gap_Low,overall_uninsured_Low}
## 16       income_gap_Med,low_income_uninsured_Low,overall_uninsured_Low}
## 17                                income_gap_Low,overall_uninsured_Low}
## 18                           high_income_uninsured_Low,income_gap_High}
## 19                      low_income_uninsured_Low,overall_uninsured_Low}
## 20       income_gap_Low,low_income_uninsured_Low,overall_uninsured_Low}
## 21                                income_gap_Low,overall_uninsured_Low}
## 22      high_income_uninsured_Low,income_gap_Med,overall_uninsured_Low}
## 23                      low_income_uninsured_Med,overall_uninsured_Med}
## 24 high_income_uninsured_High,income_gap_Med,low_income_uninsured_High}
## 25                            income_gap_Med,low_income_uninsured_High}
## 26       income_gap_Med,low_income_uninsured_Med,overall_uninsured_Med}
## 27                           high_income_uninsured_High,income_gap_Med}
## 28                             income_gap_Med,low_income_uninsured_Low}
## 29      high_income_uninsured_Med,income_gap_Med,overall_uninsured_Med}
## 30                            high_income_uninsured_Low,income_gap_Med}
##                            RHS    support confidence     lift
## 1   {low_income_uninsured_High 0.10057288  1.0000000 3.044574
## 2   {low_income_uninsured_High 0.13494589  1.0000000 3.044574
## 3   {low_income_uninsured_High 0.09898154  1.0000000 3.044574
## 4      {overall_uninsured_High 0.05028644  0.9937107 3.037197
## 5      {overall_uninsured_High 0.05028644  0.9937107 3.037197
## 6  {high_income_uninsured_High 0.05060471  1.0000000 3.032819
## 7  {high_income_uninsured_High 0.05028644  1.0000000 3.032819
## 8  {high_income_uninsured_High 0.23711012  0.9880637 2.996618
## 9  {high_income_uninsured_High 0.08784214  0.9857143 2.989493
## 10 {high_income_uninsured_High 0.09898154  0.9841772 2.984831
## 11 {high_income_uninsured_High 0.09898154  0.9841772 2.984831
## 12 {high_income_uninsured_High 0.09452578  0.9801980 2.972763
## 13     {overall_uninsured_High 0.08561426  0.9711191 2.968148
## 14   {low_income_uninsured_Low 0.13749204  1.0000000 2.958569
## 15   {low_income_uninsured_Low 0.10661999  1.0000000 2.958569
## 16  {high_income_uninsured_Low 0.09388924  0.9932660 2.958144
## 17   {low_income_uninsured_Low 0.10948440  0.9971014 2.949993
## 18      {overall_uninsured_Low 0.08752387  0.9927798 2.948312
## 19  {high_income_uninsured_Low 0.24283896  0.9857881 2.935873
## 20  {high_income_uninsured_Low 0.10661999  0.9738372 2.900281
## 21  {high_income_uninsured_Low 0.10661999  0.9710145 2.891874
## 22   {low_income_uninsured_Low 0.09388924  0.9703947 2.870980
## 23  {high_income_uninsured_Med 0.17441120  0.9547038 2.854119
## 24     {overall_uninsured_High 0.08784214  0.9292929 2.840310
## 25     {overall_uninsured_High 0.08911521  0.9240924 2.824415
## 26  {high_income_uninsured_Med 0.09007002  0.9433333 2.820127
## 27     {overall_uninsured_High 0.09834500  0.9223881 2.819206
## 28  {high_income_uninsured_Low 0.10343730  0.9365994 2.789380
## 29   {low_income_uninsured_Med 0.09007002  0.9278689 2.781836
## 30   {low_income_uninsured_Low 0.10343730  0.9393064 2.779002
# outpot
write.csv(rules_top_table, "association_rules_top_metrics.csv", row.names = FALSE)
write.csv(as(rules_top, "data.frame"), "association_rules_top.csv", row.names = FALSE)

# visualization
plot(rules_top, method = "scatterplot", measure = c("support","confidence"), shading = "lift")

plot(rules_top, method = "matrix", measure = "lift")
## Itemsets in Antecedent (LHS)
##  [1] "{high_income_uninsured_High,income_gap_High}"                         
##  [2] "{high_income_uninsured_High,income_gap_High,overall_uninsured_High}"  
##  [3] "{high_income_uninsured_High,income_gap_Low,low_income_uninsured_High}"
##  [4] "{income_gap_Low,low_income_uninsured_High}"                           
##  [5] "{income_gap_Low,low_income_uninsured_High,overall_uninsured_High}"    
##  [6] "{income_gap_High,overall_uninsured_High}"                             
##  [7] "{low_income_uninsured_High,overall_uninsured_High}"                   
##  [8] "{income_gap_Med,low_income_uninsured_High,overall_uninsured_High}"    
##  [9] "{income_gap_High,low_income_uninsured_High,overall_uninsured_High}"   
## [10] "{high_income_uninsured_High,income_gap_Low}"                          
## [11] "{high_income_uninsured_Low,income_gap_Low}"                           
## [12] "{high_income_uninsured_Low,income_gap_Low,overall_uninsured_Low}"     
## [13] "{income_gap_Med,low_income_uninsured_Low,overall_uninsured_Low}"      
## [14] "{high_income_uninsured_Low,income_gap_High}"                          
## [15] "{low_income_uninsured_Low,overall_uninsured_Low}"                     
## [16] "{income_gap_Low,overall_uninsured_Low}"                               
## [17] "{income_gap_Low,low_income_uninsured_Low,overall_uninsured_Low}"      
## [18] "{income_gap_Med,low_income_uninsured_High}"                           
## [19] "{high_income_uninsured_Low,income_gap_Med,overall_uninsured_Low}"     
## [20] "{low_income_uninsured_Med,overall_uninsured_Med}"                     
## [21] "{high_income_uninsured_High,income_gap_Med,low_income_uninsured_High}"
## [22] "{income_gap_Med,low_income_uninsured_Med,overall_uninsured_Med}"      
## [23] "{high_income_uninsured_High,income_gap_Med}"                          
## [24] "{income_gap_Med,low_income_uninsured_Low}"                            
## [25] "{high_income_uninsured_Med,income_gap_Med,overall_uninsured_Med}"     
## [26] "{high_income_uninsured_Low,income_gap_Med}"                           
## Itemsets in Consequent (RHS)
## [1] "{low_income_uninsured_Med}"   "{high_income_uninsured_Med}" 
## [3] "{high_income_uninsured_Low}"  "{low_income_uninsured_Low}"  
## [5] "{overall_uninsured_High}"     "{overall_uninsured_Low}"     
## [7] "{high_income_uninsured_High}" "{low_income_uninsured_High}"

plot(rules_top, method = "graph", engine = "htmlwidget")

18 Association Rule Interpretation


19 Conclusion