- Install packages and load dataset.
## R markdown link: https://rpubs.com/jasohosk/1424635
## Run libraries
packages <- c(
"haven",
"tidyverse",
"psych",
"ggplot2",
"tidySEM",
"OpenMx",
"gmodels",
"mice",
"nnet",
"future",
"car",
"mlogit",
"dfidx",
"tidyLPA",
"broom"
)
installed <- packages %in% rownames(installed.packages())
if (any(!installed)) {
install.packages(packages[!installed])
}
lapply(packages, library, character.only = TRUE)
## Set working library
setwd(
"/Users/jasonhoskin/Library/CloudStorage/OneDrive-IndianaUniversity/study-gambling"
)
## Import dataset
url <- "ICSUS_2025_State_18 to 25_CLN.sav"
df1 <- read_sav(url)
## Remove unnecessary variables
rm(installed, packages, url)
## Set seed
set.seed(123)
- Run descriptive statistics and simplify dataset.
## Sample size and number of variables
dim(df1)
## [1] 5402 257
## Prepare demographics
age <- df1 |>
mutate(age = case_when(
age < 5 ~ 0,
## < 21
age >= 5 ~ 1 ## 21+
)) |>
dplyr::select(autoid, age)
ethnicity <- df1 |>
mutate(ethnicity = case_when(
ethnic == 2 ~ 0,
## Not Hispanic/Latino
ethnic == 1 ~ 1 ## Hispanic/Latino
)) |>
dplyr::select(autoid, ethnicity)
race <- df1 |>
mutate(race = case_when(
race == 1 ~ 0,
## White
race == 2 ~ 1,
## Black/African American
race == 3 ~ 2,
## Asian
race == 4 ~ 4,
## Other
race == 5 ~ 4,
## Other
race == 6 ~ 3,
## Multiracial
race == 7 ~ 4,
## Other
)) |>
dplyr::select(autoid, race)
gender <- df1 |>
mutate(
gender = case_when(
gender3 == 1 ~ 0,
# Female
gender2 == 1 ~ 1,
# Male
gender1 == 1 ~ 2,
# Other
gender4 == 1 ~ 2,
# Other
gender5 == 1 ~ 2,
# Other
gender6 == 1 ~ 2,
# Other
gender7 == 1 ~ 2,
# Other
gender8 == 1 ~ 2 # Other
)
) |>
dplyr::select(autoid, gender)
greekmem <- df1 |>
mutate(greekmem = case_when(
greekmem == 2 ~ 0,
# No Greek Membership
greekmem == 1 ~ 1 # Greek Membership
)) |>
dplyr::select(autoid, greekmem)
studentstatus <- df1 |>
mutate(studentstatus = case_when(
stustat == 1 ~ 0 ## Full-Time
,
stustat == 2 ~ 1 ## Part-Time
)) |>
dplyr::select(autoid, studentstatus)
residence <- df1 |>
mutate(residence = case_when(
residenc < 4 ~ 0,
## Off Campus
residenc >= 4 ~ 1
)) |> ## On Campus
dplyr::select(autoid, residence)
## Combine dems
dfs_to_join <- list(age, ethnicity, race, gender, studentstatus, residence, greekmem)
# Use reduce to join them all by "uid"
combined_demographics <- dfs_to_join %>%
reduce(full_join, by = "autoid")
## Simplify dataset
df2 <- df1 |>
dplyr::select(autoid |
smokmo:othmo |
gmpools:gmoth |
gcqsleep:gcqdepress |
mhdays)
## Combine prepared dems to new dataset
dems_to_join <- list(combined_demographics, df2)
df2 <- dems_to_join %>%
reduce(full_join, by = "autoid")
## New sample size and number of variables
dim(df2)
## [1] 5402 46
## Calculate frequencies
freq_list <- lapply(df2[, 2:45], table, useNA = "ifany")
freq_list
## $age
##
## 0 1
## 3087 2315
##
## $ethnicity
##
## 0 1 <NA>
## 4419 670 313
##
## $race
##
## 0 1 2 3 4 <NA>
## 4148 280 487 245 194 48
##
## $gender
##
## 0 1 2 <NA>
## 3318 1802 255 27
##
## $studentstatus
##
## 0 1 <NA>
## 5168 216 18
##
## $residence
##
## 0 1 <NA>
## 2602 2797 3
##
## $greekmem
##
## 0 1 <NA>
## 4667 727 8
##
## $smokmo
##
## 1 2 3 4 5 6 7 8 <NA>
## 4106 869 238 86 37 33 18 12 3
##
## $cigarmo
##
## 1 2 3 4 5 6 7 8 <NA>
## 4538 683 125 21 6 9 1 3 16
##
## $snufmo
##
## 1 2 3 4 5 6 7 8 <NA>
## 4993 248 42 22 14 15 13 44 11
##
## $hookmo
##
## 1 2 3 4 5 6 7 8 <NA>
## 5027 306 31 11 9 1 2 4 11
##
## $ecigmo
##
## 1 2 3 4 5 6 7 8 <NA>
## 3445 1013 242 115 55 55 88 373 16
##
## $alcmo
##
## 1 2 3 4 5 6 7 8 <NA>
## 1424 1258 978 708 412 282 115 161 64
##
## $marmo
##
## 1 2 3 4 5 6 7 8 <NA>
## 3237 1049 321 170 126 124 161 202 12
##
## $cocmo
##
## 1 2 3 4 5 6 7 8 <NA>
## 5275 97 9 6 2 2 1 1 9
##
## $hallumo
##
## 1 2 3 4 5 6 7 <NA>
## 5074 269 34 9 3 1 2 10
##
## $hermo
##
## 1 2 3 4 5 <NA>
## 5378 11 1 1 1 10
##
## $methmo
##
## 1 2 3 4 5 6 8 <NA>
## 5367 20 1 1 1 2 1 9
##
## $inhmo
##
## 1 2 3 4 5 6 7 <NA>
## 5271 94 13 7 4 2 2 9
##
## $rxstmo
##
## 1 2 3 4 5 6 7 8 <NA>
## 5152 182 33 11 4 2 5 2 11
##
## $rxpkmo
##
## 1 2 3 4 5 6 7 <NA>
## 5259 119 10 3 1 1 2 7
##
## $rxsedmo
##
## 1 2 3 4 5 6 7 <NA>
## 5286 92 7 6 2 1 2 6
##
## $othmo
##
## 1 2 3 4 5 6 <NA>
## 5272 92 13 6 2 3 14
##
## $gmpools
##
## 1 2 3 4 <NA>
## 3887 278 77 27 1133
##
## $gmfant
##
## 1 2 3 4 <NA>
## 4013 162 53 35 1139
##
## $gmvideo
##
## 1 2 3 4 <NA>
## 3782 342 112 36 1130
##
## $gmonsport
##
## 1 2 3 4 <NA>
## 3921 167 103 78 1133
##
## $gmothsp
##
## 1 2 3 4 <NA>
## 4136 64 33 32 1137
##
## $gmonline
##
## 1 2 3 4 <NA>
## 4076 127 45 21 1133
##
## $gmesports
##
## 1 2 3 4 <NA>
## 4180 42 24 22 1134
##
## $gmhorse
##
## 1 2 3 4 <NA>
## 4175 73 8 9 1137
##
## $gmcards
##
## 1 2 3 4 <NA>
## 3885 262 93 27 1135
##
## $gmlott
##
## 1 2 3 4 <NA>
## 3274 876 100 18 1134
##
## $gmcasino
##
## 1 2 3 4 <NA>
## 3930 297 30 7 1138
##
## $gmcharit
##
## 1 2 3 4 <NA>
## 3831 393 35 8 1135
##
## $gmoth
##
## 1 2 3 4 <NA>
## 4132 62 16 13 1179
##
## $gcqsleep
##
## 1 2 3 <NA>
## 1748 32 6 3616
##
## $gcqhyg
##
## 1 2 3 <NA>
## 1770 15 1 3616
##
## $gcqfrnd
##
## 1 2 3 <NA>
## 1771 15 1 3615
##
## $gcqfmly
##
## 1 2 3 <NA>
## 1760 18 7 3617
##
## $gcqacad
##
## 1 2 3 <NA>
## 1764 15 4 3619
##
## $gcqmoney
##
## 1 2 3 <NA>
## 1705 64 15 3618
##
## $gcqbad
##
## 1 2 3 <NA>
## 1650 123 10 3619
##
## $gcqdepress
##
## 1 2 3 <NA>
## 1743 28 3 3628
- Assess NAs and remove all participants with no responses to the
gambling behaviors items.
colSums(is.na(df2))
## autoid age ethnicity race gender
## 0 0 313 48 27
## studentstatus residence greekmem smokmo cigarmo
## 18 3 8 3 16
## snufmo hookmo ecigmo alcmo marmo
## 11 11 16 64 12
## cocmo hallumo hermo methmo inhmo
## 9 10 10 9 9
## rxstmo rxpkmo rxsedmo othmo gmpools
## 11 7 6 14 1133
## gmfant gmvideo gmonsport gmothsp gmonline
## 1139 1130 1133 1137 1133
## gmesports gmhorse gmcards gmlott gmcasino
## 1134 1137 1135 1134 1138
## gmcharit gmoth gcqsleep gcqhyg gcqfrnd
## 1135 1179 3616 3616 3615
## gcqfmly gcqacad gcqmoney gcqbad gcqdepress
## 3617 3619 3618 3619 3628
## mhdays
## 1086
df3 <- df2 |>
filter(!if_all(
c(
gmpools,
gmfant,
gmvideo,
gmonsport,
gmothsp,
gmonline,
gmesports,
gmhorse,
gmcards,
gmlott,
gmcasino,
gmcharit,
gmoth
),
is.na
))
## Number of participants removed from dataset:
dim(df2)[1] - dim(df3)[1]
## [1] 1124
colSums(is.na(df3))
## autoid age ethnicity race gender
## 0 0 235 38 24
## studentstatus residence greekmem smokmo cigarmo
## 11 2 5 2 14
## snufmo hookmo ecigmo alcmo marmo
## 9 11 13 44 8
## cocmo hallumo hermo methmo inhmo
## 6 8 9 7 6
## rxstmo rxpkmo rxsedmo othmo gmpools
## 10 6 6 8 9
## gmfant gmvideo gmonsport gmothsp gmonline
## 15 6 9 13 9
## gmesports gmhorse gmcards gmlott gmcasino
## 10 13 11 10 14
## gmcharit gmoth gcqsleep gcqhyg gcqfrnd
## 11 55 2492 2492 2491
## gcqfmly gcqacad gcqmoney gcqbad gcqdepress
## 2493 2495 2494 2495 2504
## mhdays
## 34
- Discriminate skip logic NAs from missing data NAs.
## Create a gambling participation indicator
gm_vars <- names(df3)[25:37] # gmpools:gmoth
df3$any_gamble <- apply(df3[gm_vars], 1, function(x) {
if (all(is.na(x)))
return(NA)
if (all(x == 1, na.rm = TRUE))
return(0) # never gambled
return(1) # gambled at least once
})
## Create a function to separate skip logic NAs from missing data NAs
classify_gcq_na <- function(data, gate_var, consequence_vars) {
gate <- data[[gate_var]]
results <- lapply(consequence_vars, function(var) {
item <- data[[var]]
skip_na <- sum(is.na(item) & gate == 0, na.rm = TRUE)
true_na <- sum(is.na(item) & gate == 1, na.rm = TRUE)
observed <- sum(!is.na(item))
data.frame(
variable = var,
skip_logic_na = skip_na,
true_missing_na = true_na,
observed = observed
)
})
do.call(rbind, results)
}
## Apply function to dataset
gcq_vars <- names(df3)[38:45] # gcqsleep:gcqdepress
na_summary <- classify_gcq_na(data = df3,
gate_var = "any_gamble",
consequence_vars = gcq_vars)
na_summary
## variable skip_logic_na true_missing_na observed
## 1 gcqsleep 2462 30 1786
## 2 gcqhyg 2462 30 1786
## 3 gcqfrnd 2462 29 1787
## 4 gcqfmly 2462 31 1785
## 5 gcqacad 2462 33 1783
## 6 gcqmoney 2462 32 1784
## 7 gcqbad 2462 33 1783
## 8 gcqdepress 2462 42 1774
- Make the variables dichotomous and convert to class “factor”.
df4 <- df3 |>
mutate(across(
.cols = starts_with("gm"),
.fns = ~ case_when(.x == 1 ~ 1, .x > 1 ~ 2, .default = NA),
.names = "{.col}_dich"
))
df5 <- df4 |>
mutate(across(
.cols = starts_with("gcq"),
.fns = ~ case_when(.x == 1 ~ 1, .x > 1 ~ 2, .default = NA),
.names = "{.col}_dich"
))
df6 <- df5 |>
mutate(across(
.cols = ends_with("mo"),
.fns = ~ case_when(.x == 1 ~ 1, .x > 1 ~ 2, .default = NA),
.names = "{.col}_dich"
))
## Filter out unnecessary variables
df6 <- df6 |>
dplyr::select(autoid:greekmem | gmpools_dich:othmo_dich | mhdays)
# Convert substances and gambling to factor with 1 as reference.
cols_to_factor <- colnames(df6)[9:45]
df6[cols_to_factor] <- lapply(df6[cols_to_factor], function(x) {
x <- as.factor(x)
x <- relevel(x, ref = "1")
return(x)
})
## Convert demographics to class "factor"
df6$age <- factor(df6$age,
levels = c(0, 1),
labels = c("<21 (ref)", "21+"))
df6$ethnicity <- factor(
df6$ethnicity,
levels = c(0, 1),
labels = c("Not Hispanic/Latino (ref)", "Hispanic/Latino")
)
df6$race <- factor(
df6$race,
levels = c(0, 1, 2, 3, 4),
labels = c(
"White (ref)",
"African American/Black",
"Asian",
"Multiracial",
"Other"
)
)
df6$gender <- factor(
df6$gender,
levels = c(0, 1, 2),
labels = c("Female (ref)", "Male", "Other")
)
df6$studentstatus <- factor(
df6$studentstatus,
levels = c(0, 1),
labels = c("Full Time (ref)", "Part Time")
)
df6$residence <- factor(
df6$residence,
levels = c(0, 1),
labels = c("Off Campus (ref)", "On Campus")
)
df6$greekmem <- factor(df6$greekmem,
levels = c(0, 1),
labels = c("No Greek (ref)", "Greek"))
# Reference group verification
demo_vars <- c("age",
"ethnicity",
"race",
"gender",
"studentstatus",
"residence",
"greekmem")
for (v in demo_vars) {
cat(sprintf(
"%-16s | Reference: %-30s | Levels: %s\n",
v,
levels(df6[[v]])[1],
paste(levels(df6[[v]]), collapse = " / ")
))
}
## age | Reference: <21 (ref) | Levels: <21 (ref) / 21+
## ethnicity | Reference: Not Hispanic/Latino (ref) | Levels: Not Hispanic/Latino (ref) / Hispanic/Latino
## race | Reference: White (ref) | Levels: White (ref) / African American/Black / Asian / Multiracial / Other
## gender | Reference: Female (ref) | Levels: Female (ref) / Male / Other
## studentstatus | Reference: Full Time (ref) | Levels: Full Time (ref) / Part Time
## residence | Reference: Off Campus (ref) | Levels: Off Campus (ref) / On Campus
## greekmem | Reference: No Greek (ref) | Levels: No Greek (ref) / Greek
lapply(df6[demo_vars], table, useNA = "ifany")
## $age
##
## <21 (ref) 21+
## 2407 1871
##
## $ethnicity
##
## Not Hispanic/Latino (ref) Hispanic/Latino <NA>
## 3525 518 235
##
## $race
##
## White (ref) African American/Black Asian
## 3309 222 379
## Multiracial Other <NA>
## 190 140 38
##
## $gender
##
## Female (ref) Male Other <NA>
## 2683 1346 225 24
##
## $studentstatus
##
## Full Time (ref) Part Time <NA>
## 4099 168 11
##
## $residence
##
## Off Campus (ref) On Campus <NA>
## 2045 2231 2
##
## $greekmem
##
## No Greek (ref) Greek <NA>
## 3741 532 5
- Conduct the first LCA using the
tidySEM package.
## Set seed
set.seed(123)
df6_lca1 <- df6 |>
dplyr::select(gmpools_dich:gmoth_dich) |>
as.data.frame()
## Run first LCA (code used for knitting)
lca1 <- readRDS("lca1_results.rds")
## Run first LCA (code used for running first time)
## lca1 <- tidySEM::mx_lca(data = df6_lca1,
## classes = 1:4,
## missing = "fiml",
## mutate(across(gmpools_dich:gcqdepress_dich, as.ordered))
## Save LCA for future knitting
## lca1 <- saveRDS(lca1, file = "lca1_results.rds")
- Assess latent classes for first LCA.
fit1 <- table_fit(lca1)
fit1
## Name Classes LL n Parameters AIC BIC saBIC Entropy
## 1 1 1 -13815.68 4278 13 27657.36 27740.06 27698.75 1.0000000
## 2 2 2 -12071.01 4278 27 24196.01 24367.77 24281.97 0.8538182
## 3 3 3 -11805.73 4278 41 23693.46 23954.27 23823.99 0.7098784
## 4 4 4 -11697.07 4278 55 23504.14 23854.00 23679.24 0.7785529
## prob_min prob_max n_min n_max np_ratio np_local
## 1 1.0000000 1.0000000 1.00000000 1.0000000 329.07692 329.076923
## 2 0.8703959 0.9827206 0.16432913 0.8356709 158.44444 54.076923
## 3 0.7706319 0.9206315 0.10402057 0.6280972 104.34146 34.230769
## 4 0.5839871 0.9831005 0.01496026 0.7515194 77.78182 4.923077
# Lo-Mendell-Rubin Adjusted Likelihood Ratio Test (LMR-ALRT)
## 1-class vs 2-class
calc_lrt(
n = as.numeric(fit1[1, 4]),
null_ll = as.numeric(fit1[1, 3]),
null_param = as.numeric(fit1[1, 5]),
null_classes = as.numeric(fit1[1, 2]),
alt_ll = as.numeric(fit1[2, 3]),
alt_param = as.numeric(fit1[2, 5]),
alt_classes = as.numeric(fit1[2, 2])
)
## Lo-Mendell-Rubin ad-hoc adjusted likelihood ratio rest:
##
## LR = 3489.348, LMR LR (df = 14) = 3355.573, p < 0.001
## 1-class vs 3-class
calc_lrt(
n = as.numeric(fit1[1, 4]),
null_ll = as.numeric(fit1[1, 3]),
null_param = as.numeric(fit1[1, 5]),
null_classes = as.numeric(fit1[1, 2]),
alt_ll = as.numeric(fit1[3, 3]),
alt_param = as.numeric(fit1[3, 5]),
alt_classes = as.numeric(fit1[3, 2])
)
## Lo-Mendell-Rubin ad-hoc adjusted likelihood ratio rest:
##
## LR = 4019.897, LMR LR (df = 28) = 3941.333, p < 0.001
## 1-class vs 4-class
calc_lrt(
n = as.numeric(fit1[1, 4]),
null_ll = as.numeric(fit1[1, 3]),
null_param = as.numeric(fit1[1, 5]),
null_classes = as.numeric(fit1[1, 2]),
alt_ll = as.numeric(fit1[4, 3]),
alt_param = as.numeric(fit1[4, 5]),
alt_classes = as.numeric(fit1[4, 2])
)
## Lo-Mendell-Rubin ad-hoc adjusted likelihood ratio rest:
##
## LR = 4237.225, LMR LR (df = 42) = 4181.656, p < 0.001
## 2-class vs 3-class
calc_lrt(
n = as.numeric(fit1[2, 4]),
null_ll = as.numeric(fit1[2, 3]),
null_param = as.numeric(fit1[2, 5]),
null_classes = as.numeric(fit1[2, 2]),
alt_ll = as.numeric(fit1[3, 3]),
alt_param = as.numeric(fit1[3, 5]),
alt_classes = as.numeric(fit1[3, 2])
)
## Lo-Mendell-Rubin ad-hoc adjusted likelihood ratio rest:
##
## LR = 530.549, LMR LR (df = 14) = 510.208, p < 0.001
## 2-class vs 4-class
calc_lrt(
n = as.numeric(fit1[2, 4]),
null_ll = as.numeric(fit1[2, 3]),
null_param = as.numeric(fit1[2, 5]),
null_classes = as.numeric(fit1[2, 2]),
alt_ll = as.numeric(fit1[4, 3]),
alt_param = as.numeric(fit1[4, 5]),
alt_classes = as.numeric(fit1[4, 2])
)
## Lo-Mendell-Rubin ad-hoc adjusted likelihood ratio rest:
##
## LR = 747.877, LMR LR (df = 28) = 733.261, p < 0.001
## 3-class vs 4-class
calc_lrt(
n = as.numeric(fit1[3, 4]),
null_ll = as.numeric(fit1[3, 3]),
null_param = as.numeric(fit1[3, 5]),
null_classes = as.numeric(fit1[3, 2]),
alt_ll = as.numeric(fit1[4, 3]),
alt_param = as.numeric(fit1[4, 5]),
alt_classes = as.numeric(fit1[4, 2])
)
## Lo-Mendell-Rubin ad-hoc adjusted likelihood ratio rest:
##
## LR = 217.328, LMR LR (df = 14) = 208.996, p < 0.001
# Visualize item probability profiles
plot_prob(lca1[[3]]) +
theme(axis.text.x = element_text(
angle = 45,
hjust = 1,
size = 7
))

plot_prob(lca1[[4]]) +
theme(axis.text.x = element_text(
angle = 45,
hjust = 1,
size = 7
))

# Minimum sample size for 3 classes
fit1[3, 4] * fit1[3, 12]
## [1] 445
# Minimum sample size for 4 classes
fit1[4, 4] * fit1[4, 12]
## [1] 64
- Conduct the second latent class analysis.
random_seed <- sample(1:10000, 1)
cat("Seed used:", random_seed, "\n")
## Seed used: 2463
set.seed(random_seed)
## Refilter dataset to only include appropriate variables
df6_lca2 <- df6 |>
filter(if_any(gmpools_dich:gcqdepress_dich, ~ . == 2)) |>
dplyr::select(autoid | gmpools_dich:gcqdepress_dich) |>
mutate(across(gmpools_dich:gcqdepress_dich, as.ordered))
## lca2 <- tidySEM::mx_lca(data = df6_lca2[2:22],
## classes = 1:4,
## missing = "fiml")
## Save LCA for future knitting
## lca2 <- saveRDS(lca2, file = "lca2_results.rds")
## Run second LCA (code use for knitting)
lca2 <- readRDS("lca2_results.rds")
- Assess latent classes for second LCA.
fit2 <- table_fit(lca2)
fit2
## Name Classes LL n Parameters AIC BIC saBIC Entropy
## 1 1 1 -11354.87 1816 21 22751.74 22867.34 22800.62 1.0000000
## 2 2 2 -10549.84 1816 43 21185.68 21422.37 21285.76 0.8525082
## 3 3 3 -10346.39 1816 65 20822.79 21180.57 20974.07 0.8818748
## 4 4 4 -10115.17 1816 87 20404.34 20883.23 20606.83 0.7868395
## prob_min prob_max n_min n_max np_ratio np_local
## 1 1.0000000 1.0000000 1.00000000 1.0000000 86.47619 86.476190
## 2 0.8752273 0.9790906 0.18116740 0.8188326 42.23256 15.666667
## 3 0.8765232 0.9788917 0.06442731 0.7637665 27.93846 5.571429
## 4 0.8066107 0.9354766 0.01762115 0.5952643 20.87356 1.523810
# Lo-Mendell-Rubin Adjusted Likelihood Ratio Test (LMR-ALRT)
## 1-class vs 2-class
calc_lrt(
n = as.numeric(fit2[1, 4]),
null_ll = as.numeric(fit2[1, 3]),
null_param = as.numeric(fit2[1, 5]),
null_classes = as.numeric(fit2[1, 2]),
alt_ll = as.numeric(fit2[2, 3]),
alt_param = as.numeric(fit2[2, 5]),
alt_classes = as.numeric(fit2[2, 2])
)
## Lo-Mendell-Rubin ad-hoc adjusted likelihood ratio rest:
##
## LR = 1610.067, LMR LR (df = 22) = 1541.592, p < 0.001
## 2-class vs 3-class
calc_lrt(
n = as.numeric(fit2[2, 4]),
null_ll = as.numeric(fit2[2, 3]),
null_param = as.numeric(fit2[2, 5]),
null_classes = as.numeric(fit2[2, 2]),
alt_ll = as.numeric(fit2[3, 3]),
alt_param = as.numeric(fit2[3, 5]),
alt_classes = as.numeric(fit2[3, 2])
)
## Lo-Mendell-Rubin ad-hoc adjusted likelihood ratio rest:
##
## LR = 406.890, LMR LR (df = 22) = 389.586, p < 0.001
## 3-class vs 4-class
calc_lrt(
n = as.numeric(fit2[3, 4]),
null_ll = as.numeric(fit2[3, 3]),
null_param = as.numeric(fit2[3, 5]),
null_classes = as.numeric(fit2[3, 2]),
alt_ll = as.numeric(fit2[4, 3]),
alt_param = as.numeric(fit2[4, 5]),
alt_classes = as.numeric(fit2[4, 2])
)
## Lo-Mendell-Rubin ad-hoc adjusted likelihood ratio rest:
##
## LR = 462.442, LMR LR (df = 22) = 442.775, p < 0.001
# Visualize item probability profiles
plot_prob(lca2[[3]]) +
theme(axis.text.x = element_text(
angle = 45,
hjust = 1,
size = 7
))

plot_prob(lca2[[4]]) +
theme(axis.text.x = element_text(
angle = 45,
hjust = 1,
size = 7
))

# Minimum sample size for 3 classes
fit2[3, 4] * fit2[3, 12]
## [1] 117
# Minimum sample size for 4 classes
fit2[4, 4] * fit2[4, 12]
## [1] 32
# Get class proportions from the 3-class LCA2 model
probs <- class_prob(lca2[[3]], type = "sum.posterior")
probs
## $sum.posterior
## class count proportion
## 1 class1 317.5183 0.17484485
## 2 class2 120.9421 0.06659809
## 3 class3 1377.5396 0.75855705
- Apply latent class assignments to dataframes.
## Assign latent classes to dataframe
# Extract most likely class assignment for each participant
lca1_classes <- class_prob(lca1[[3]], type = "individual")$individual[, "predicted"]
lca2_classes <- class_prob(lca2[[3]], type = "individual")$individual[, "predicted"]
# Add LCA 1 classes to gamblers-only subset
df6$lca1_class <- lca1_classes
# Add LCA 2 classes to gamblers-only subset
df6_lca2$lca2_class <- lca2_classes
# Then merge back to full dataset by row identifier
df6 <- df6 |>
left_join(df6_lca2 |> dplyr::select(autoid, lca2_class), by = "autoid")
## Verify that it worked
table(df6$lca1_class, useNA = "always")
##
## 1 2 3 <NA>
## 1146 445 2687 0
# Should show counts for classes 1, 2, 3 with no NAs
table(df6$lca2_class, useNA = "always")
##
## 1 2 3 <NA>
## 312 117 1387 2462
# Should show counts for classes 1, 2, 3 with 2462 NAs
- Prepare dataframes for regression modeling.
## Filter out any unnecessary variables
df6_model1 <- df6 |>
dplyr::select(autoid:greekmem | smokmo_dich:lca1_class | mhdays)
## Filter out non-gamblers to only include participants in 2nd LCA
df6_model2 <- df6 |>
filter(!is.na(lca2_class)) |>
dplyr::select(autoid:greekmem |
smokmo_dich:othmo_dich | lca2_class | mhdays)
## Make DVs class "factor"
df6_model1 <- df6_model1 |>
mutate(lca1_class = as.factor(lca1_class))
df6_model2 <- df6_model2 |>
mutate(lca2_class = as.factor(lca2_class))
### 4. Relevel DVs so no gambling is the reference group
df6_model1$lca1_class <- relevel(df6_model1$lca1_class, ref = 3)
df6_model2$lca2_class <- relevel(df6_model2$lca2_class, ref = 3)
- Treat missing data using
MICE and evaluate the
imputations.
plan(multisession, workers = parallel::detectCores() - 1)
# Strip all haven labels across the whole data frame
df6_model1 <- haven::zap_labels(df6_model1)
df6_model1 <- haven::zap_label(df6_model1) # variable labels
df6_model1 <- haven::zap_labels(df6_model1) # value labels
# Strip all haven labels across the whole data frame
df6_model2 <- haven::zap_labels(df6_model2)
df6_model2 <- haven::zap_label(df6_model2) # variable labels
df6_model2 <- haven::zap_labels(df6_model2) # value labels
# Define variables
outcome1 <- "lca1_class"
predictors <- c(
"age",
"ethnicity",
"race",
"gender",
"studentstatus",
"residence",
"greekmem",
"smokmo_dich",
"cigarmo_dich",
"snufmo_dich",
"hookmo_dich",
"ecigmo_dich",
"alcmo_dich",
"marmo_dich",
"cocmo_dich",
"hallumo_dich",
"hermo_dich",
"methmo_dich",
"inhmo_dich",
"rxstmo_dich",
"rxpkmo_dich",
"rxsedmo_dich",
"othmo_dich",
"mhdays"
)
all_vars1 <- c(outcome1, predictors)
# Establish imputation method
meth1 <- make.method(df6_model1[, all_vars1])
meth1[] <- "cart"
# Trimmed predictor matrix
pred1 <- quickpred(df6_model1[, all_vars1], mincor = 0.1, minpuc = 0.1)
# Impute in parallel
imp1 <- futuremice(
df6_model1[, all_vars1],
method = meth1,
predictorMatrix = pred1,
m = 20,
maxit = 10,
parallelseed = 42
)
## Evaluate imputations
plot(imp1)







