This markdown file shows the sequence analysis based on raw IPC scores
Starts by importing IPC 2024 version
ipc_2024= read_csv("/Volumes/Nafeesa/NCHS data/Cleaning NCHS Do-Files/IPC Final Full Data 2024 Update.csv", col_names = TRUE) %>%
janitor::clean_names() %>%
mutate(year= as.character(year),
secure_communities= if_else(as.double(year)>2014, NA_integer_, as.double(secure_communities)),
total = rowSums(across(where(is.numeric)), na.rm = TRUE))
## Rows: 765 Columns: 20
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): state, abv
## dbl (18): year, pub_insurance_immigrant_kids, prenatal_care_pregnant_immigra...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
The following code creates a dataframe of states (or statuses, not states in the USA “very exclusionary”,“somewhat exclusionary”,“somewhat inclusionary”, “very inclusionary” ) and them a state sequence opject
absolute.sts= ipc_2024 %>%
dplyr::select(state, year, total) %>%
mutate(relative_level =cut(total,breaks=c(-14, -6,-1,5, 14),labels=c("very exclusionary","somewhat exclusionary","somewhat inclusionary", "very inclusionary"))) %>%
dplyr::select(-total) %>%
pivot_wider(names_from = year, names_prefix = "y", values_from = relative_level) %>%
dplyr::select(-state) %>%
data.frame()
#creates ID vector
states= ipc_2024 %>%
distinct(state)
id= states$state
absolute.sso= seqdef(absolute.sts, id=id, alphabet =c("very exclusionary","somewhat exclusionary","somewhat inclusionary", "very inclusionary"))
## [>] found missing values ('NA') in sequence data
## [>] preparing 51 sequences
## [>] coding void elements with '%' and missing values with '*'
## [!!] 12 empty sequence(s) with index: 1,2,3,11,13,15,23,25,26,34, ...
## may produce inconsistent results.
## [>] 4 distinct states appear in the data:
## 1 = somewhat exclusionary
## 2 = somewhat inclusionary
## 3 = very exclusionary
## 4 = very inclusionary
## [>] state coding:
## [alphabet] [label] [long label]
## 1 very exclusionary very exclusionary very exclusionary
## 2 somewhat exclusionary somewhat exclusionary somewhat exclusionary
## 3 somewhat inclusionary somewhat inclusionary somewhat inclusionary
## 4 very inclusionary very inclusionary very inclusionary
## [>] 51 sequences in the data set
## [>] min/max sequence length: 0/15
OM calculates how different each sequence is from the other sequences
Then we do a cluser analysis to determine how many clusters of sequences is best
library(WeightedCluster)
## This is WeightedCluster stable version 1.8-1 (Built: 2024-12-10)
##
## To access available manuals and short tutorials, please run:
## vignette("WeightedCluster") ## For the complete manual in English
## vignette(package="WeightedCluster") ## To list available documentation
##
## To cite WeightedCluster in publications please use or references in the help pages:
## Studer, Matthias (2013). WeightedCluster Library Manual: A practical
## guide to creating typologies of trajectories in the social sciences
## with R. LIVES Working Papers, 24. doi:
## 10.12682/lives.2296-1658.2013.24
absolute.om=seqdist(absolute.sso, method = "OM", indel = 1, sm = "TRATE", with.missing=TRUE)
## [!!] Empty sequence(s) 1, 2, 3, 11, 13, 15, 23, 25, 26, 34, 35, 41 may cause inconsistent results!
## [>] including missing values as an additional state
## [>] 51 sequences with 5 distinct states
## [>] Computing sm with seqcost using TRATE
## [>] creating substitution-cost matrix using transition rates ...
## [>] computing transition probabilities for states very exclusionary/somewhat exclusionary/somewhat inclusionary/very inclusionary/* ...
## [>] 38 distinct sequences
## [>] min/max sequence lengths: 0/15
## [>] computing distances using the OM metric
## [>] elapsed time: 0.011 secs
clusterward = agnes(absolute.om, diss = TRUE, method = "ward")
abs.ward0 = hclust(as.dist(absolute.om), method = "ward.D")
abs.wardtest=as.clustrange(abs.ward0, diss = absolute.om, ncluster = 7)
abs.wardtest
## PBC HG HGSD ASW ASWw CH R2 CHsq R2sq HC
## cluster2 0.57 0.70 0.69 0.41 0.44 18.22 0.27 33.36 0.41 0.12
## cluster3 0.50 0.58 0.58 0.31 0.34 15.47 0.39 22.30 0.48 0.19
## cluster4 0.56 0.68 0.68 0.37 0.41 15.44 0.50 26.31 0.63 0.14
## cluster5 0.63 0.82 0.82 0.42 0.48 14.82 0.56 30.03 0.72 0.09
## cluster6 0.63 0.84 0.83 0.44 0.51 14.83 0.62 30.50 0.77 0.08
## cluster7 0.65 0.89 0.89 0.47 0.55 14.71 0.67 31.95 0.81 0.06
plot(abs.wardtest, lwd = 4)
plot(abs.wardtest, norm = "zscore", lwd = 4)
plot(abs.wardtest, stat = c("ASW", "HC", "PBC"), norm = "zscore", lwd = 4)
For these goodness of fit tests I think three clusters of sequences is best fit, but four clusters is also acceptible
Next we look at the three cluster solution and the four cluster solution to see if one makes more sense conceptually.
seq d plots should the distribution of ipc category in each measurement point. !! IMPORTENT rows do not represent actual US stats and their sequences!!
cpal(absolute.sso)= c("#d7191c", "#fdae61","#abd9e9","#2c7bb6")
absolute.cl3 = cutree(clusterward, k = 3)
absolute3.lab = factor(absolute.cl3, labels = paste("Cluster", 1:3))
seqdplot(absolute.sso, group = absolute3.lab,border = NA)
absolute.cl4 = cutree(clusterward, k = 4)
absolute4.lab = factor(absolute.cl4, labels = paste("Cluster", 1:4))
seqdplot(absolute.sso, group = absolute4.lab,border = NA)
all3= cbind(absolute.sso , absolute.cl4,absolute.cl3 ) %>%
seqdef(var= names(absolute.sso), alphabet =c("very exclusionary","somewhat exclusionary","somewhat inclusionary", "very inclusionary", "*", "%"))
To look at US state trajectories. It looks way better and makes more sense sorted by starting value
cpal(absolute.sso) = c("#d7191c", "#fdae61", "#abd9e9", "#2c7bb6")
ipc_first= ipc_2024 %>%
filter(year==2009)
seqIplot(all3, group= absolute.cl3, sortv= desc(ipc_first$total), ylab=NA, ytlab = "id", las=1, bty="n", cex.axis=.62)
seqIplot(all3, group= absolute.cl4, main = c("Very exclusionary", "Somewhat exclusionary", "More inclusive over time", "Exclusionary, moving towards neutral"), sortv= desc(ipc_first$total), ylab=NA, ytlab = "id", las=1, bty="n", cex.axis=.62)
The three cluster solution seems more like low, medium, high. the four cluster solution centers change more
But, Texas doesn’t fit conceptually. Massachusets could also be moved
The next chunk puts the three and four cluster assignments onto the IPC dataset and moves Texas
one_row_per_state= cbind(absolute.sso, absolute.cl4, absolute.cl3) %>%
mutate(state= rownames(.),
cluster3=if_else(state=="Texas", 2, absolute.cl3),
cluster4=if_else(state=="Texas", 2, absolute.cl4))
just_clusters= one_row_per_state %>%
select(state, cluster3, cluster4)
IPC_w_seq_cluster= left_join(ipc_2024, just_clusters, by="state")