sequence analysis

This markdown file shows the sequence analysis based on raw IPC scores

Starts by importing IPC 2024 version

ipc_2024= read_csv("/Volumes/Nafeesa/NCHS data/Cleaning NCHS Do-Files/IPC Final Full Data 2024 Update.csv", col_names = TRUE) %>% 
  janitor::clean_names() %>% 
  mutate(year= as.character(year),
         secure_communities= if_else(as.double(year)>2014, NA_integer_, as.double(secure_communities)),
         total = rowSums(across(where(is.numeric)), na.rm = TRUE))

## Rows: 765 Columns: 20
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (2): state, abv
## dbl (18): year, pub_insurance_immigrant_kids, prenatal_care_pregnant_immigra...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

The following code creates a dataframe of states (or statuses, not states in the USA “very exclusionary”,“somewhat exclusionary”,“somewhat inclusionary”, “very inclusionary” ) and them a state sequence opject

absolute.sts=  ipc_2024 %>%
  dplyr::select(state, year, total) %>%
  mutate(relative_level =cut(total,breaks=c(-14, -6,-1,5, 14),labels=c("very exclusionary","somewhat exclusionary","somewhat inclusionary", "very inclusionary"))) %>% 
  dplyr::select(-total) %>% 
  pivot_wider(names_from = year, names_prefix = "y", values_from = relative_level) %>% 
  dplyr::select(-state) %>% 
  data.frame()

#creates ID vector
states= ipc_2024 %>%
  distinct(state)
id= states$state


absolute.sso= seqdef(absolute.sts, id=id, alphabet =c("very exclusionary","somewhat exclusionary","somewhat inclusionary", "very inclusionary"))

##  [>] found missing values ('NA') in sequence data

##  [>] preparing 51 sequences

##  [>] coding void elements with '%' and missing values with '*'

##  [!!] 12 empty sequence(s) with index: 1,2,3,11,13,15,23,25,26,34, ...
##       may produce inconsistent results.

##  [>] 4 distinct states appear in the data:

##      1 = somewhat exclusionary

##      2 = somewhat inclusionary

##      3 = very exclusionary

##      4 = very inclusionary

##  [>] state coding:

##        [alphabet]            [label]               [long label]

##      1  very exclusionary     very exclusionary     very exclusionary

##      2  somewhat exclusionary somewhat exclusionary somewhat exclusionary

##      3  somewhat inclusionary somewhat inclusionary somewhat inclusionary

##      4  very inclusionary     very inclusionary     very inclusionary

##  [>] 51 sequences in the data set

##  [>] min/max sequence length: 0/15

OM calculates how different each sequence is from the other sequences

Then we do a cluser analysis to determine how many clusters of sequences is best

library(WeightedCluster)

## This is WeightedCluster stable version 1.8-1 (Built: 2024-12-10)

## 
## To access available manuals and short tutorials, please run:

##    vignette("WeightedCluster") ## For the complete manual in English

##    vignette(package="WeightedCluster") ## To list available documentation

## 
## To cite WeightedCluster in publications please use or references in the help pages:

## Studer, Matthias (2013). WeightedCluster Library Manual: A practical

##    guide to creating typologies of trajectories in the social sciences

##    with R. LIVES Working Papers, 24. doi:

##    10.12682/lives.2296-1658.2013.24

absolute.om=seqdist(absolute.sso, method = "OM", indel = 1, sm = "TRATE", with.missing=TRUE)

##  [!!] Empty sequence(s)  1, 2, 3, 11, 13, 15, 23, 25, 26, 34, 35, 41  may cause inconsistent results!

##  [>] including missing values as an additional state

##  [>] 51 sequences with 5 distinct states

##  [>] Computing sm with seqcost using  TRATE

##  [>] creating substitution-cost matrix using transition rates ...

##  [>] computing transition probabilities for states very exclusionary/somewhat exclusionary/somewhat inclusionary/very inclusionary/* ...

##  [>] 38 distinct  sequences

##  [>] min/max sequence lengths: 0/15

##  [>] computing distances using the OM metric

##  [>] elapsed time: 0.011 secs

clusterward = agnes(absolute.om, diss = TRUE, method = "ward")
abs.ward0 = hclust(as.dist(absolute.om), method = "ward.D")
abs.wardtest=as.clustrange(abs.ward0, diss = absolute.om, ncluster = 7)
abs.wardtest

##           PBC   HG HGSD  ASW ASWw    CH   R2  CHsq R2sq   HC
## cluster2 0.57 0.70 0.69 0.41 0.44 18.22 0.27 33.36 0.41 0.12
## cluster3 0.50 0.58 0.58 0.31 0.34 15.47 0.39 22.30 0.48 0.19
## cluster4 0.56 0.68 0.68 0.37 0.41 15.44 0.50 26.31 0.63 0.14
## cluster5 0.63 0.82 0.82 0.42 0.48 14.82 0.56 30.03 0.72 0.09
## cluster6 0.63 0.84 0.83 0.44 0.51 14.83 0.62 30.50 0.77 0.08
## cluster7 0.65 0.89 0.89 0.47 0.55 14.71 0.67 31.95 0.81 0.06

plot(abs.wardtest,  lwd = 4)

plot(abs.wardtest, norm = "zscore", lwd = 4)

plot(abs.wardtest, stat = c("ASW", "HC", "PBC"), norm = "zscore", lwd = 4)

For these goodness of fit tests I think three clusters of sequences is best fit, but four clusters is also acceptible

Next we look at the three cluster solution and the four cluster solution to see if one makes more sense conceptually.

seq d plots should the distribution of ipc category in each measurement point. !! IMPORTENT rows do not represent actual US stats and their sequences!!

cpal(absolute.sso)= c("#d7191c", "#fdae61","#abd9e9","#2c7bb6")

absolute.cl3 = cutree(clusterward, k = 3) 
absolute3.lab = factor(absolute.cl3, labels = paste("Cluster", 1:3))

seqdplot(absolute.sso, group = absolute3.lab,border = NA)

absolute.cl4 = cutree(clusterward, k = 4) 
absolute4.lab = factor(absolute.cl4, labels = paste("Cluster", 1:4))

seqdplot(absolute.sso, group = absolute4.lab,border = NA)

all3= cbind(absolute.sso , absolute.cl4,absolute.cl3 ) %>% 
  seqdef(var= names(absolute.sso), alphabet =c("very exclusionary","somewhat exclusionary","somewhat inclusionary", "very inclusionary", "*", "%"))

To look at US state trajectories. It looks way better and makes more sense sorted by starting value

cpal(absolute.sso) = c("#d7191c", "#fdae61", "#abd9e9", "#2c7bb6")

ipc_first= ipc_2024 %>% 
  filter(year==2009)

seqIplot(all3, group= absolute.cl3, sortv= desc(ipc_first$total), ylab=NA, ytlab = "id", las=1, bty="n", cex.axis=.62)

seqIplot(all3, group= absolute.cl4, main = c("Very exclusionary", "Somewhat exclusionary", "More inclusive over time", "Exclusionary, moving towards neutral"), sortv= desc(ipc_first$total), ylab=NA, ytlab = "id", las=1, bty="n", cex.axis=.62)

The three cluster solution seems more like low, medium, high. the four cluster solution centers change more

But, Texas doesn’t fit conceptually. Massachusets could also be moved

The next chunk puts the three and four cluster assignments onto the IPC dataset and moves Texas

one_row_per_state= cbind(absolute.sso, absolute.cl4, absolute.cl3) %>% 
  mutate(state= rownames(.),
         cluster3=if_else(state=="Texas", 2, absolute.cl3),
         cluster4=if_else(state=="Texas", 2, absolute.cl4))

just_clusters= one_row_per_state %>% 
  select(state, cluster3, cluster4)

IPC_w_seq_cluster= left_join(ipc_2024, just_clusters, by="state")

sequence analysis

Amanda Nagle

2025-02-14