ライブラリのロード

rm(list = ls())
set.seed(1234)
library(pacman)
p_load(e1071,poLCA,reshape2,tidyverse,knitr,gtsummary,flextable,
       ztable,forcats,missForest,tidyLPA,MASS,nnet,flexmix,poLCAParallel,reshape2,
       doParallel,doRNG)

データの生成

set.seed(123)
n <- 500  # サンプルサイズ


# クラスごとに異なる分布を持つデータを生成
generate_class_data <- function(n, probs) {
  data <- matrix(sample(1:5, n * 21, replace = TRUE, prob = probs), nrow = n)
  missing_indices <- sample(seq_len(length(data)), size = 0.05 * length(data))
  data[missing_indices] <- NA
  data <- as.data.frame(data)
  return(data)
}

data_class1 <- generate_class_data(n, c(0.05, 0.1, 0.15, 0.1, 0.6))
data_class2 <- generate_class_data(n, c(0.1, 0.15, 0.2, 0.25, 0.3))
data_class3 <- generate_class_data(n, c(0.15, 0.2, 0.5, 0.05, 0.2))
data_class4 <- generate_class_data(n, c(0.2, 0.25, 0.3, 0.15, 0.1))
data_class5 <- generate_class_data(n, c(0.3, 0.3, 0.2, 0.15, 0.05))

# データを結合
data <- rbind(data_class1, data_class2, data_class3, data_class4, data_class5)
# 列名をX1, X2, ...に設定
colnames(data) <- paste0("X", 1:21)

# データをFactor型に変換
data <- data.frame(lapply(data, function(x) as.factor(x)))


# 欠測値の補完
start.time <- Sys.time()
doParallel::registerDoParallel(cores = 4) # set based on number of CPU cores
doRNG::registerDoRNG(seed = 123)
imputed_data <- missForest(data,parallelize = "forests")$ximp
end.time <- Sys.time()
time.taken <- end.time - start.time
time.taken
## Time difference of 30.22103 secs
# データの確認
head(imputed_data)

積み上げ棒グラフ

imputed_data_long_x <- imputed_data %>%
  pivot_longer(cols = everything(), names_to = "Variable", values_to = "Value")

ggplot(imputed_data_long_x, aes(x = Variable, fill = Value)) +
  geom_bar(position = "fill") +
  labs(title = "Stacked Bar Plot of X1 to X21", x = "Variable", y = "Proportion", fill = "Value") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

5段階リッカートを0-1に変換し、変数名をV1からV21を追加

imputed_data_v <- data.frame(lapply(imputed_data, function(x) as.factor(ifelse(x == 5, 1, 0))))
colnames(imputed_data_v) <- paste0("V", 1:21)

# 元のX1からX21を保持し、新しいV1からV21を結合
imputed_data <- cbind(imputed_data, imputed_data_v)

# データの確認
head(imputed_data)

V1からV21の項目を1の割合が高い順に示す

# 1の割合を計算
one_proportion <- sapply(imputed_data_v, function(x) mean(as.numeric(as.character(x)) == 1))

# 順にソート
sorted_indices <- order(one_proportion, decreasing = TRUE)
sorted_data <- imputed_data_v[, sorted_indices]
sorted_proportion <- one_proportion[sorted_indices]

# ソートされたデータの確認
sorted_proportion
##    V17    V15    V14     V3    V10    V18    V12    V20     V9    V16     V5 
## 0.2660 0.2628 0.2592 0.2584 0.2580 0.2556 0.2552 0.2552 0.2544 0.2540 0.2528 
##     V8     V2    V19     V4     V7     V1    V13    V11    V21     V6 
## 0.2528 0.2508 0.2508 0.2504 0.2496 0.2476 0.2464 0.2460 0.2448 0.2444

潜在クラス分析の実行と適合度指標の保存  

f <- as.formula(paste("cbind(", paste(colnames(sorted_data), collapse = ", "), ") ~ 1"))
max_classes <- 10

# 並列処理の設定
cl <- makeCluster(detectCores() - 1)
registerDoParallel(cl)

# 適合度指標の保存
logLik_values <- numeric(max_classes)
bic_values <- numeric(max_classes)
sabic_values <- numeric(max_classes)
aic_values <- numeric(max_classes)
caic_values <- numeric(max_classes)
entropy_values <- numeric(max_classes)
vlmr_p_values <- numeric(max_classes)
start.time <- Sys.time()

# モデルを保存するリストを初期化
model_fit <- vector("list", max_classes)

results <- foreach(k = 1:max_classes, .packages = c("poLCA", "tidyLPA")) %dopar% {
  model <- poLCA(f, imputed_data, nclass = k, maxiter = 3000, tol = 1e-6, nrep = 30)
  logLik <- model$llik
  bic <- model$bic
  sabic <- (-2 * model$llik) + (log((model$N + 2) / 24) * model$npar)
  aic <- model$aic
  caic <- (-2 * model$llik) + model$npar * (1 + log(model$N))
  entropy <- poLCA.entropy(model)
  list(model = model, logLik = logLik, bic = bic, sabic = sabic, aic = aic, caic = caic, entropy = entropy)
}

# 結果を保存
for (k in 1:max_classes) {
  model_fit[[k]] <- results[[k]]$model
  logLik_values[k] <- results[[k]]$logLik
  bic_values[k] <- results[[k]]$bic
  sabic_values[k] <- results[[k]]$sabic
  aic_values[k] <- results[[k]]$aic
  caic_values[k] <- results[[k]]$caic
  entropy_values[k] <- results[[k]]$entropy
}

# Vuong-Lo-Mendell-Rubin (VLMR) 検定の実行
for (k in 2:max_classes) {
  null_model <- model_fit[[k - 1]]
  alt_model <- model_fit[[k]]
  n <- null_model$N
  null_ll <- null_model$llik
  null_param <- null_model$npar
  null_classes <- length(null_model$P)
  alt_ll <- alt_model$llik
  alt_param <- alt_model$npar
  alt_classes <- length(alt_model$P)
  vlmr_p_values[k] <- calc_lrt(n, null_ll, null_param, null_classes, alt_ll, alt_param, alt_classes)[[4]]
}

# 適合度指標の表を作成
fit_indices <- data.frame(
  Classes = 1:max_classes,
  LogLikelihood = logLik_values,
  BIC = bic_values,
  SABIC = sabic_values,
  AIC = aic_values,
  CAIC = caic_values,
  Entropy = entropy_values,
  VLMR_p = c(NA, vlmr_p_values[-1])
)

# 並列処理の終了
stopCluster(cl)


end.time <- Sys.time()
time.taken <- end.time - start.time
time.taken
fit_indices

エルボープロットの作成

fit_indices_melted <- melt(fit_indices, id.vars = "Classes", measure.vars = c("BIC", "SABIC", "AIC", "CAIC"))

ggplot(fit_indices_melted, aes(x = Classes, y = value, color = variable)) +
  geom_line() +
  geom_point() +
  scale_x_continuous(breaks = 1:max_classes) +
  labs(title = "Elbow Plot for Model Fit Indices", x = "Number of Classes", y = "Value", color = "Fit Index") +
  theme_minimal()

クラスごとのデモグラフィック特徴の解析

# クラスごとのデモグラフィック特徴の解析(仮の変数を追加)
imputed_data$age <- sample(75:95, n , replace = TRUE)
imputed_data$gender <- sample(c("Male", "Female"), n , replace = TRUE)
imputed_data$income <- sample(c("Low", "High"), n , replace = TRUE)
imputed_data$education <- sample(c("Less than college", "College or more"), n, replace = TRUE)
imputed_data$QOL <- sample(1:5, n, replace = TRUE)

# クラスごとの集計
imputed_data$predclass <- model_fit[[which.min(sabic_values)]]$predclass

# クラスごとのテーブルを作成
demo_table <- imputed_data %>%
  select(predclass, age, gender, income, education, QOL) %>%
  tbl_summary(by = predclass, missing = "no") %>%
  as_flex_table() %>%
  flextable::fontsize(size = 9, part = "all") %>%
  theme_zebra() %>%
  autofit() %>%
  set_table_properties(width = 1, layout = "autofit", opts_pdf = list(tabcolsep = 3))

# テーブルの表示
demo_table

Characteristic

1, N = 5311

2, N = 1,2011

3, N = 7681

age

85 (80, 90)

85 (80, 91)

85 (80, 90)

gender

Female

262 (49%)

563 (47%)

368 (48%)

Male

269 (51%)

638 (53%)

400 (52%)

income

High

272 (51%)

607 (51%)

384 (50%)

Low

259 (49%)

594 (49%)

384 (50%)

education

College or more

257 (48%)

593 (49%)

380 (49%)

Less than college

274 (52%)

608 (51%)

388 (51%)

QOL

1

101 (19%)

214 (18%)

159 (21%)

2

117 (22%)

247 (21%)

141 (18%)

3

102 (19%)

247 (21%)

183 (24%)

4

110 (21%)

251 (21%)

144 (19%)

5

101 (19%)

242 (20%)

141 (18%)

1Median (IQR); n (%)

# sorted_data から demographic 変数を除去して long format に変換
imputed_data_long <- imputed_data %>%
  select(predclass, starts_with("X")) %>%
  pivot_longer(cols = -predclass, names_to = "Variable", values_to = "Value")

ggplot(imputed_data_long, aes(x = Variable, fill = Value)) +
  geom_bar(position = "fill") +
  facet_wrap(~ predclass, scales = "free") +
  labs(title = "Stacked Bar Plot of X1 to X21 by Predclass", x = "Variable", y = "Proportion", fill = "Value") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))