rm(list = ls())
set.seed(1234)
library(pacman)
p_load(e1071,poLCA,reshape2,tidyverse,knitr,gtsummary,flextable,
ztable,forcats,missForest,tidyLPA,MASS,nnet,flexmix,poLCAParallel,reshape2,
doParallel,doRNG)
set.seed(123)
n <- 500 # サンプルサイズ
# クラスごとに異なる分布を持つデータを生成
generate_class_data <- function(n, probs) {
data <- matrix(sample(1:5, n * 21, replace = TRUE, prob = probs), nrow = n)
missing_indices <- sample(seq_len(length(data)), size = 0.05 * length(data))
data[missing_indices] <- NA
data <- as.data.frame(data)
return(data)
}
data_class1 <- generate_class_data(n, c(0.05, 0.1, 0.15, 0.1, 0.6))
data_class2 <- generate_class_data(n, c(0.1, 0.15, 0.2, 0.25, 0.3))
data_class3 <- generate_class_data(n, c(0.15, 0.2, 0.5, 0.05, 0.2))
data_class4 <- generate_class_data(n, c(0.2, 0.25, 0.3, 0.15, 0.1))
data_class5 <- generate_class_data(n, c(0.3, 0.3, 0.2, 0.15, 0.05))
# データを結合
data <- rbind(data_class1, data_class2, data_class3, data_class4, data_class5)
# 列名をX1, X2, ...に設定
colnames(data) <- paste0("X", 1:21)
# データをFactor型に変換
data <- data.frame(lapply(data, function(x) as.factor(x)))
# 欠測値の補完
start.time <- Sys.time()
doParallel::registerDoParallel(cores = 4) # set based on number of CPU cores
doRNG::registerDoRNG(seed = 123)
imputed_data <- missForest(data,parallelize = "forests")$ximp
end.time <- Sys.time()
time.taken <- end.time - start.time
time.taken
## Time difference of 30.22103 secs
# データの確認
head(imputed_data)
imputed_data_long_x <- imputed_data %>%
pivot_longer(cols = everything(), names_to = "Variable", values_to = "Value")
ggplot(imputed_data_long_x, aes(x = Variable, fill = Value)) +
geom_bar(position = "fill") +
labs(title = "Stacked Bar Plot of X1 to X21", x = "Variable", y = "Proportion", fill = "Value") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
imputed_data_v <- data.frame(lapply(imputed_data, function(x) as.factor(ifelse(x == 5, 1, 0))))
colnames(imputed_data_v) <- paste0("V", 1:21)
# 元のX1からX21を保持し、新しいV1からV21を結合
imputed_data <- cbind(imputed_data, imputed_data_v)
# データの確認
head(imputed_data)
# 1の割合を計算
one_proportion <- sapply(imputed_data_v, function(x) mean(as.numeric(as.character(x)) == 1))
# 順にソート
sorted_indices <- order(one_proportion, decreasing = TRUE)
sorted_data <- imputed_data_v[, sorted_indices]
sorted_proportion <- one_proportion[sorted_indices]
# ソートされたデータの確認
sorted_proportion
## V17 V15 V14 V3 V10 V18 V12 V20 V9 V16 V5
## 0.2660 0.2628 0.2592 0.2584 0.2580 0.2556 0.2552 0.2552 0.2544 0.2540 0.2528
## V8 V2 V19 V4 V7 V1 V13 V11 V21 V6
## 0.2528 0.2508 0.2508 0.2504 0.2496 0.2476 0.2464 0.2460 0.2448 0.2444
f <- as.formula(paste("cbind(", paste(colnames(sorted_data), collapse = ", "), ") ~ 1"))
max_classes <- 10
# 並列処理の設定
cl <- makeCluster(detectCores() - 1)
registerDoParallel(cl)
# 適合度指標の保存
logLik_values <- numeric(max_classes)
bic_values <- numeric(max_classes)
sabic_values <- numeric(max_classes)
aic_values <- numeric(max_classes)
caic_values <- numeric(max_classes)
entropy_values <- numeric(max_classes)
vlmr_p_values <- numeric(max_classes)
start.time <- Sys.time()
# モデルを保存するリストを初期化
model_fit <- vector("list", max_classes)
results <- foreach(k = 1:max_classes, .packages = c("poLCA", "tidyLPA")) %dopar% {
model <- poLCA(f, imputed_data, nclass = k, maxiter = 3000, tol = 1e-6, nrep = 30)
logLik <- model$llik
bic <- model$bic
sabic <- (-2 * model$llik) + (log((model$N + 2) / 24) * model$npar)
aic <- model$aic
caic <- (-2 * model$llik) + model$npar * (1 + log(model$N))
entropy <- poLCA.entropy(model)
list(model = model, logLik = logLik, bic = bic, sabic = sabic, aic = aic, caic = caic, entropy = entropy)
}
# 結果を保存
for (k in 1:max_classes) {
model_fit[[k]] <- results[[k]]$model
logLik_values[k] <- results[[k]]$logLik
bic_values[k] <- results[[k]]$bic
sabic_values[k] <- results[[k]]$sabic
aic_values[k] <- results[[k]]$aic
caic_values[k] <- results[[k]]$caic
entropy_values[k] <- results[[k]]$entropy
}
# Vuong-Lo-Mendell-Rubin (VLMR) 検定の実行
for (k in 2:max_classes) {
null_model <- model_fit[[k - 1]]
alt_model <- model_fit[[k]]
n <- null_model$N
null_ll <- null_model$llik
null_param <- null_model$npar
null_classes <- length(null_model$P)
alt_ll <- alt_model$llik
alt_param <- alt_model$npar
alt_classes <- length(alt_model$P)
vlmr_p_values[k] <- calc_lrt(n, null_ll, null_param, null_classes, alt_ll, alt_param, alt_classes)[[4]]
}
# 適合度指標の表を作成
fit_indices <- data.frame(
Classes = 1:max_classes,
LogLikelihood = logLik_values,
BIC = bic_values,
SABIC = sabic_values,
AIC = aic_values,
CAIC = caic_values,
Entropy = entropy_values,
VLMR_p = c(NA, vlmr_p_values[-1])
)
# 並列処理の終了
stopCluster(cl)
end.time <- Sys.time()
time.taken <- end.time - start.time
time.taken
fit_indices
fit_indices_melted <- melt(fit_indices, id.vars = "Classes", measure.vars = c("BIC", "SABIC", "AIC", "CAIC"))
ggplot(fit_indices_melted, aes(x = Classes, y = value, color = variable)) +
geom_line() +
geom_point() +
scale_x_continuous(breaks = 1:max_classes) +
labs(title = "Elbow Plot for Model Fit Indices", x = "Number of Classes", y = "Value", color = "Fit Index") +
theme_minimal()
# クラスごとのデモグラフィック特徴の解析(仮の変数を追加)
imputed_data$age <- sample(75:95, n , replace = TRUE)
imputed_data$gender <- sample(c("Male", "Female"), n , replace = TRUE)
imputed_data$income <- sample(c("Low", "High"), n , replace = TRUE)
imputed_data$education <- sample(c("Less than college", "College or more"), n, replace = TRUE)
imputed_data$QOL <- sample(1:5, n, replace = TRUE)
# クラスごとの集計
imputed_data$predclass <- model_fit[[which.min(sabic_values)]]$predclass
# クラスごとのテーブルを作成
demo_table <- imputed_data %>%
select(predclass, age, gender, income, education, QOL) %>%
tbl_summary(by = predclass, missing = "no") %>%
as_flex_table() %>%
flextable::fontsize(size = 9, part = "all") %>%
theme_zebra() %>%
autofit() %>%
set_table_properties(width = 1, layout = "autofit", opts_pdf = list(tabcolsep = 3))
# テーブルの表示
demo_table
Characteristic | 1, N = 5311 | 2, N = 1,2011 | 3, N = 7681 |
---|---|---|---|
age | 85 (80, 90) | 85 (80, 91) | 85 (80, 90) |
gender | |||
Female | 262 (49%) | 563 (47%) | 368 (48%) |
Male | 269 (51%) | 638 (53%) | 400 (52%) |
income | |||
High | 272 (51%) | 607 (51%) | 384 (50%) |
Low | 259 (49%) | 594 (49%) | 384 (50%) |
education | |||
College or more | 257 (48%) | 593 (49%) | 380 (49%) |
Less than college | 274 (52%) | 608 (51%) | 388 (51%) |
QOL | |||
1 | 101 (19%) | 214 (18%) | 159 (21%) |
2 | 117 (22%) | 247 (21%) | 141 (18%) |
3 | 102 (19%) | 247 (21%) | 183 (24%) |
4 | 110 (21%) | 251 (21%) | 144 (19%) |
5 | 101 (19%) | 242 (20%) | 141 (18%) |
1Median (IQR); n (%) |
# sorted_data から demographic 変数を除去して long format に変換
imputed_data_long <- imputed_data %>%
select(predclass, starts_with("X")) %>%
pivot_longer(cols = -predclass, names_to = "Variable", values_to = "Value")
ggplot(imputed_data_long, aes(x = Variable, fill = Value)) +
geom_bar(position = "fill") +
facet_wrap(~ predclass, scales = "free") +
labs(title = "Stacked Bar Plot of X1 to X21 by Predclass", x = "Variable", y = "Proportion", fill = "Value") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, hjust = 1))