library(lavaan)
library(earth)
nBootstrap <- 1000
# specify population model
population.model <- ' f1 =~ x1 + 0.8*x2 + 1.2*x3
f2 =~ x4 + 0.5*x5 + 1.5*x6
f3 =~ x7 + 0.1*x8 + 0.9*x9
f1 ~~ 0.3*f2
f1 ~~ 0.4*f3
f2 ~~ 0.5*f3
f1 ~ 0.3*f2 + 0.6*f3
'
myModel <- ' f1 =~ x1 + x2 + x3
f2 =~ x4 + x5 + x6
f3 =~ x7 + x8 + x9
f1 ~ f2 + f3
'
# create random data
createData_SEM <- function(nBootstrap = nBootstrap) {
n = 200
set.seed(90210)
dataOrig <- simulateData(population.model, sample.nobs = n)
dataB <- vector("list", length = nBootstrap)
#create non-parametric bootstrap
for (i in 1:nBootstrap) {
boot.idx <- sample(n, replace = TRUE)
dataB[[i]] <- dataOrig[boot.idx, ]
}
list(dataOrig = dataOrig, dataB = dataB)
}
ALL <- createData_SEM( nBootstrap = nBootstrap )
original <- numeric(nBootstrap)
F1 <- numeric(nBootstrap)
F2 <- numeric(nBootstrap)
fit.org <- sem(myModel, data = ALL$dataOrig)
#fit.org <- sam(myModel, data = ALL$dataOrig)
original.optimum <- parametertable(fit.org)
theta.hat <- matrix(NA, nBootstrap, length(original.optimum$est))
p <- 9 #ncol(vcov(fit.org))#9
fp_len <- p * (p + 1) / 2
fp <- matrix(NA, nBootstrap, fp_len)
ERROR <- vector("numeric", nBootstrap)
conv <- vector("numeric", nBootstrap)
for (b in 1:nBootstrap) {
cat("bootstrap =", b)
# original
out <- sem(myModel, se = "none",
data = ALL$dataB[[b]],
start = original.optimum,
control = list(optim.method = "NLMINB", rel.tol = 1e-6))
original[b] <- out@Fit@iterations
parTab <- parametertable(out)
theta.hat[b,] <- parTab$est
VCOV.x <- cov(ALL$dataB[[b]])
fp[b, ] <- VCOV.x[upper.tri(VCOV.x, diag = TRUE)]
train.theta.hat <- theta.hat[1:(b-1L), ,drop = FALSE]
train.fp <- fp[1:(b-1L), ,drop = FALSE]
if (b %% 1000 == 0) {
# learn relationship between train.fp and train.theta.hat
fit.ml <- earth::earth(x = train.fp, y = train.theta.hat, degree = 2)
theta.pred <- predict(fit.ml,
newdata = data.frame(train.fp = fp[b, ,drop = FALSE]))
starting.values <- theta.pred
starting.values[!is.na(parTab$ustart)] <- parTab$ustart[!is.na(parTab$ustart)]
parTab$est <- c(starting.values)
out <- sem(myModel, se = "none",
data = ALL$dataB[[b]],
start = parTab,
control = list(optim.method = "NLMINB", rel.tol = 1e-6)
)
conv[b] <- out@Fit@converged
cat("Converged = ", conv[b])
ERROR[b] <- sqrt(mean(c(theta.pred - parametertable(out)$est)^2))
F2[b] <- out@Fit@iterations
} else {
F2[b] <- original[b]
}
cat(" ...Iterations = ", F2[b], "\n")
}
plot(y = ERROR[seq(0, nBootstrap, by = 100)],
x = seq(1, nBootstrap, by = 100),
type = "l",
main = "latent variables",
ylab = "RMSE")
RESULT <- parametertable(out)
RESULT$theta.pred <- c(theta.pred)
RESULTOverzicht Onderwerpen Machine Learning SEM
Fingerprint Resampling
Fingerprint resampling is een methode om het resamplen aanzienlijk te versnellen door eerdere berekeningen slim te hergebruiken. In plaats van elke keer het volledige model opnieuw te optimaliseren, proberen we uit eerder geobserveerde relaties te leren hoe de output (modelparameters) verandert bij verschillende inputs (resamplede datasets).
Wat is het doel?
Het doel is om de resamplingprocedure bij computationeel zware modellen — zoals SEM of mixed models — sterk te versnellen of de optimalisatie stap zelfs volledig over te slaan. Dit wordt gedaan door gebruik te maken van een voorspelling van de parameteroptima op basis van samenvattende kenmerken (“fingerprints”) van de resamplede data, zoals de scorefunctie (gradiënt), de Hessiaan, of andere sample statistieken.
Wat hebben we gedaan?
In lijn met Mestdagh et al. (2015), hebben we onderzocht of er een voorspelbare relatie bestaat tussen verschillende inputs: we hebben gekeken naar de variantie-covariantiematrix van de data, de model-implied covariantie-matrix, en de sample-statistieken in combinatie met gradiënten en Hessianen.
We hebben gekeken of deze relatie (mogelijk niet-lineair) geleerd kan worden, zodat deze voorspelling kan dienen als startwaarden voor optimalisatie, zodat het aantal iteraties fors gerduceerd kan worden of dat de startwaarden zelfs gelijk worden aan de parameter optima en daarmee het optimalisatie process geheel kan vervangen.
We hebben deze relatie geleerd met behulp van multivariate adaptive regression splines (earth uit het earth-package) en deze vervolgens gebruikt om startwaarden te voorspellen bij bootstrap-schattingen. We hebben gekeken of deze aanpak het aantal iteraties reduceert en of het zelfs de optimalisatiestap volledig kan overnemen. Naast de regressie methode hebben we ook gekeken naar neurale netwerken, maar deze zijn zonder gebruik te kunnen maken van GPU kernen computationeel te belastend.
Bron: Mestdagh, M., Verdonck, S., Duisters, K., & Tuerlinckx, F. (2015). Fingerprint resampling: A generic method for efficient resampling. Scientific Reports, 5, 16970. https://doi.org/10.1038/srep16970
Hieronder een voorbeeld waarbij we de covariantie matrix van de data als input gebruiken:
Ervaringen
- Mars is een goede methode om startwaarden te generen, de RMSE daalt al snel.
- Mars is een van de weinige methoden die overweg kan met multivariate uitkomsten.
- De implementatie van Mars, het earth package in R, wordt al snel traag waardoor het niet praktisch is.
Startwaarden en Iteraties – Simulatiestudie
Door middel van een simulatie hebben we onderzocht wat de relatie is tussen de afstand van de startwaarden tot het ware optimum en het aantal benodigde iteraties tot convergentie in SEM-optimalisatie.
Wat is het doel?
Inzicht verkrijgen in hoe gevoelig de optimizer (hier NLMINB) is voor verstoringen in de startwaarden, en zo beter begrijpen hoe belangrijk goede initiële schattingen zijn in bijvoorbeeld bootstrap of fingerprint resampling.
Wat hebben we gedaan?
We hebben voor verschillende standaarddeviaties sd willekeurige verstoringen toegevoegd aan de ware parameterwaarden (theta_hat) en het model telkens opnieuw geoptimaliseerd. Voor elke combinatie hebben we het aantal iteraties en de afstand tot het optimum geregistreerd. De resultaten worden gevisualiseerd met een scatterplot en loess-curve.
library(lavaan)
pop.syntax <- '
visual =~ 1*x1 + 0.5*x2 + 0.7*x3
textual =~ 1*x4 + 1.1*x5 + 0.9*x6
speed =~ 1*x7 + 1.2*x8 + 1.1*x9
'
syntax <- '
visual =~ x1 + x2 + x3
textual =~ x4 + x5 + x6
speed =~ x7 + x8 + x9
'
set.seed(42)
DATA <- simulateData(pop.syntax, model.type = "cfa", sample.nobs = 200)
fit.lav <- sem(syntax, data = DATA)
theta_hat <- coef(fit.lav)
theta_names <- names(theta_hat)
n_params <- length(theta_hat)
sds <- c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 0.2, 0.5, 1)
n_reps <- 100
results <- data.frame()
r <- 100
for (sd in sds) {
cat("sd = ", sd, "\n")
if (sd > 0) {
for (r in 1:n_reps) {
cat(" ...step =", r, "\n")
perturbed_start <- theta_hat + rnorm(n_params, mean = 0, sd = sd)
fit <- tryCatch(
sem(syntax, data = DATA, start = perturbed_start,
control = list(optim.method = "NLMINB", rel.tol = 1e-6)),
error = function(e) NULL
)
if (!is.null(fit)) {
n_iter <- fit@Fit@iterations
dist <- sqrt(sum((perturbed_start - theta_hat)^2))
results <- rbind(results, data.frame(sd = sd, rep = r, iterations = n_iter, distance = dist))
} else {
results <- rbind(results, data.frame(sd = sd, rep = r, iterations = NA, distance = NA))
}
}
} else {
fit <- tryCatch(
sem(syntax, data = DATA,
control = list(optim.method = "NLMINB", rel.tol = 1e-6)),
error = function(e) NULL
)
if (!is.null(fit)) {
n_iter <- fit@Fit@iterations
dist <- sqrt(sum((coef(fit) - theta_hat)^2))
results <- rbind(results, data.frame(sd = sd, rep = r, iterations = n_iter, distance = dist))
} else {
results <- rbind(results, data.frame(sd = sd, rep = r, iterations = NA, distance = NA))
}
}
}library(ggplot2)
ggplot(results[results$iterations > 0, ], aes(x = distance, y = iterations, color = as.factor(sd))) +
geom_point(alpha = 0.6) +
geom_smooth(method = "loess", se = FALSE) +
labs(title = "Effect van startwaarde-afstand tot optimum op aantal iteraties",
x = "Euclidische afstand tot theta_hat",
y = "Aantal iteraties",
color = "sd van rnorm") +
theme_minimal() +
geom_hline(yintercept = results$iterations[1], linetype = "dashed", color = "red")`geom_smooth()` using formula = 'y ~ x'
Conclusie
- De NLMINB-optimizer profiteert van goede startwaarden, maar voert alsnog een zoektocht door de parameterruimte uit. Het aantal iteraties ligt daardoor meestal tussen de 10 en 20. Verdere reductie is lastig, tenzij de startwaarden zeer dicht bij het optimum liggen.
Higher-Order Swiss Army Infinitesimal Jackknife (HOIJ)
De HOIJ benadert het effect van herwegingen van de data (zoals in bootstrap) op modelparameters via een hogere-orde Taylorexpansie rond de oorspronkelijke fit. Dit bespaart herhaalde model-fits en levert nauwkeurige schattingen van variabiliteit.
Wat is het doel?
Ons doel was om de HOIJ toe te passen binnen SEM, specifiek om:
De efficiëntie van bootstrapprocedures te verhogen;
Snelle, maar nauwkeurige variantieschattingen te verkrijgen;
Betere startwaarden te genereren;
Het aantal iteraties van het optimalisatieproces te reduceren.
Wat hebben we gedaan?
Gebaseerd op het HOIJ-paper van Giordano et al. (2019), hebben we:
De 1e orde IJ-term berekend;
De 2e orde HOIJ-term toegevoegd waarbij we een versimpelde OPG-estimator (outer-product-of-gradients) van de 2de orde hebben gebruikt;
Deze berekening hebben we zowel via matrixalgebra als volledig gevectoriseerd in Torch geïmplementeerd;
Een adaptive bound toegevoegd, gebaseerd op SE’s en Hessiaan-eigenwaarden.
Bron: Giordano, R., Broderick, T., & Jordan, M. I. (2019). A Higher-Order Swiss Army Infinitesimal Jackknife. arXiv preprint arXiv:1907.12116
Bereken de variantieschatting van de parameters o.b.v. de HOIJ.
# lavaan CFA model
HS.model <- ' visual =~ x1 + x2 + x3
textual =~ x4 + x5 + x6
speed =~ x7 + x8 + x9
visual ~ textual + speed
'
# in werkelijkheid wil je dit veel hoger zetten (zeg 5000)
R <- 500
set.seed(42)
fit <- sem(HS.model, data = HolzingerSwineford1939, se = "none", estimator = "ml")
out <- bootstrapLavaan(fit, R = R, keep.idx = TRUE, verbose = FALSE)
IDX1 <- attr(out, "boot.idx")
Scores <- lavScores(fit, scaling = FALSE)
H.inv <- lavTech(fit, "inverted.information.observed")
theta0 <- coef(fit)
N <- nrow(Scores)
D <- ncol(Scores)
COEF2 <- matrix(NA, R, D)
for (i in 1:R) {
#cat("iteration =", i, "\n")
freq <- numeric(N)
Table <- table(IDX1[[1]][i, ])
freq[as.integer(names(Table))] <- Table[]
wt <- freq / sum(freq)
delta.w <- wt - rep(1, N)
# 1e orde
G <- colSums(Scores * delta.w)
theta_1st <- theta0 - H.inv %*% G
# 2e orde
G2 <- matrix(0, D, D)
for (j in 1:N) {
s <- Scores[j, , drop = FALSE]
G2 <- G2 + delta.w[j]^2 * (t(s) %*% s) # OPG-estimator
}
G2 <- G2 / N
theta_hoij <- drop(theta0 - H.inv %*% G + 0.5 * diag(H.inv %*% G2 %*% H.inv))
COEF2[i, ] <- theta_hoij
}
colnames(COEF2) <- names(theta0)
round(apply(out, 2, var), 3) visual=~x2 visual=~x3 textual=~x5 textual=~x6
0.018 0.019 0.005 0.004
speed=~x8 speed=~x9 visual~textual visual~speed
0.020 0.138 0.008 0.075
x1~~x1 x2~~x2 x3~~x3 x4~~x4
0.026 0.014 0.010 0.003
x5~~x5 x6~~x6 x7~~x7 x8~~x8
0.003 0.002 0.009 0.014
x9~~x9 visual~~visual textual~~textual speed~~speed
0.017 0.019 0.016 0.011
textual~~speed
0.003
round(apply(COEF2, 2, var), 3) visual=~x2 visual=~x3 textual=~x5 textual=~x6
0.016 0.018 0.005 0.003
speed=~x8 speed=~x9 visual~textual visual~speed
0.016 0.075 0.008 0.055
x1~~x1 x2~~x2 x3~~x3 x4~~x4
0.024 0.014 0.010 0.003
x5~~x5 x6~~x6 x7~~x7 x8~~x8
0.003 0.002 0.009 0.015
x9~~x9 visual~~visual textual~~textual speed~~speed
0.015 0.017 0.017 0.012
textual~~speed
0.003
Ervaringen
- Elegante methode om varianties te schatten.
- Voor nu enkel de OPG-estimator ingebouwd.
Startwaarden en Iteraties
Betere startwaarden kunnen leiden tot minder iteraties, vlotte convergentie of minder mislukte fits.
Wat is het doel?
We willen betere startwaarden genereren die dichter bij de uiteindelijke oplossing liggen, zodat het aantal iteraties afneemt en de kans op convergentie toeneemt. Hiervoor benutten we structurele informatie uit het model zelf.
Wat hebben we gedaan?
We hebben HOIJ-benaderde parameters gebruikt als startwaarden:
De implementatie van een functie
theta_hoij()die zowel eerste-orde (IJ) als tweede-orde (HOIJ) correcties toepast;De toepassing van deze schatter op bootstrap-samples, waarbij de invloed van herwogen scores wordt gecorrigeerd met een geschaalde outer-product-of-gradients (OPG);
De toevoeging van bounds om parameters tijdens de optimalisatie binnen plausibele marges te houden;
Een experiment waarin we 30 bootstrap-fits uitvoerden met drie scenario’s:
- Lavaan met standaard startwaarden;
- HOIJ-gegenereerde startwaarden;
- HOIJ + bounds.
De resultaten tonen een duidelijke reductie in het gemiddeld aantal iteraties in de HOIJ-condities.
library(lavaan)
# ======================== HOIJ-functie ========================
theta_hoij <- function(fit, idx) {
S <- lavScores(fit, scaling = TRUE)
H <- inspect(fit, "information.observed")
theta0 <- coef(fit)
N <- nrow(S)
p <- length(theta0)
wt <- table(factor(idx, levels = 1:N))
delta_w <- as.numeric(wt) - 1
H_inv <- lavTech(fit, "inverted.information.observed")
# Eerste orde term
G <- colSums(S * delta_w)
theta_total <- drop(theta0 - H_inv %*% G)
# Tweede orde OPG-correctie
G2 <- matrix(0, p, p)
for (j in 1:N) {
sj <- S[j, , drop = FALSE]
G2 <- G2 + (delta_w[j]^2) * t(sj) %*% sj
}
G2 <- G2 / N
M <- H_inv %*% G2 %*% H_inv
second_term <- 0.5 * drop(M %*% rep(1, p))
theta_total <- theta_total + second_term
return(as.numeric(theta_total))
}
# ======================== Main bootstrap loop ========================
set.seed(42)
pop.syntax <- '
visual =~ 1*x1 + 0.5*x2 + 0.7*x3
textual =~ 1*x4 + 1.1*x5 + 0.9*x6
speed =~ 1*x7 + 1.2*x8 + 1.1*x9
visual ~ 0.3*textual + 0.8*speed
'
syntax <- '
visual =~ x1 + x2 + x3
textual =~ x4 + x5 + x6
speed =~ x7 + x8 + x9
visual ~ textual + speed
'
set.seed(42)
DATA <- simulateData(pop.syntax, model.type = "cfa", sample.nobs = 500)
fit.lav <- sem(syntax, data = DATA)
N <- nrow(DATA)
R <- 1000
IDX1 <- replicate(R, sample(N, replace = TRUE), simplify = FALSE)
Scores <- lavScores(fit.lav, scaling = FALSE)
H.inv <- lavTech(fit.lav, "inverted.information.observed")
theta0 <- coef(fit.lav)
N <- nrow(Scores)
D <- ncol(Scores)
COEF <- matrix(NA, R, D)
for (i in 1:R) {
#cat("iteration =", i, "\n")
freq <- numeric(N)
Table <- table(IDX1[[i]])
freq[as.integer(names(Table))] <- Table[]
wt <- freq / sum(freq)
delta.w <- wt - rep(1, N)
# 1e orde
G <- colSums(Scores * delta.w)
theta_1st <- theta0 - H.inv %*% G
# 2e orde
G2 <- matrix(0, D, D)
for (j in 1:N) {
s <- Scores[j, , drop = FALSE]
G2 <- G2 + delta.w[j]^2 * (t(s) %*% s)
}
G2 <- G2 / N
theta_hat_hoij <- drop(theta0 - H.inv %*% G + 0.5 * diag(H.inv %*% G2 %*% H.inv))
COEF[i, ] <- theta_hat_hoij
}
colnames(COEF) <- names(theta0)
#####
B <- 30
N <- nrow(DATA)
P <- length(theta0)
F_default <- F_hoij <- F_bounds <- numeric(B)
conv_default <- conv_hoij <- conv_bounds <- numeric(B)
nbounds <- numeric(B)
PT <- parametertable(fit.lav)
PT$est <- NULL
PT$se <- NULL
PT$start <- NULL
H <- inspect(fit.lav, "information.observed")
eigvals <- eigen(H)$values
bound_scale <- pmin(2, pmax(1, 1.5 / sqrt(abs(eigvals))))
se <- round(apply(COEF, 2, var, na.rm = TRUE), 3)
for (b in 1:B) {
#cat("b =", b)
idx <- sample(1:N, replace = TRUE)
# --- Lavaan default fit ---
fit_default <- cfa(syntax, data = DATA[idx, ],
control = list(optim.method = "NLMINB", rel.tol = 1e-6))
# --- HOIJ start fit ---
hoij_start <- theta_hoij(fit.lav, idx)
fit_hoij <- cfa(syntax, data = DATA[idx, ], start = hoij_start,
control = list(optim.method = "NLMINB", rel.tol = 1e-6))
# --- HOIJ start fit met lower en upper bounds ---
PT$lower[PT$free > 0] <- hoij_start - bound_scale * sqrt(se)
PT$upper[PT$free > 0] <- hoij_start + bound_scale * sqrt(se)
fit_bounds <- cfa(PT, data = DATA[idx, ], start = hoij_start,
control = list(optim.method = "NLMINB", rel.tol = 1e-6))
F_default[b] <- fit_default@Fit@iterations
F_hoij[b] <- fit_hoij@Fit@iterations
F_bounds[b] <- fit_bounds@Fit@iterations
conv_default[b] <- fit_default@Fit@converged
conv_hoij[b] <- fit_hoij@Fit@converged
conv_bounds[b] <- fit_bounds@Fit@converged
nbounds[b] <- sum(is.na(fit_bounds@Fit@se))
cat(" Iteraties default =", F_default[b],
" HOIJ =", F_hoij[b],
" HOIJ+Bounds =", F_bounds[b], "\n")
}
# ======================== Evaluatie ========================
cat("\nMediaan aantal iteraties:\n")
cat("Default: ", median(F_default), "\n")
cat("HOIJ: ", median(F_hoij), "\n")
cat("HOIJ + Bounds: ", median(F_bounds), "\n")
cat("\nVariantie aantal iteraties:\n")
cat("Default: ", var(F_default), "\n")
cat("HOIJ: ", var(F_hoij), "\n")
cat("HOIJ + Bounds: ", var(F_bounds), "\n")
cat("\nAantal convergente fits:\n")
cat("Default: ", sum(conv_default), "\n")
cat("HOIJ: ", sum(conv_hoij), "\n")
cat("HOIJ + Bounds: ", sum(conv_bounds), "\n")
cat("\nAantal boundary issues:\n")
cat(sum(nbounds > 0), " van ", B, "\n")TensorFlow SEM met Computation Graphs
SEM-modellen kunnen zelf opgebouwd worden in TensorFlow. Hierdoor kunnen we volledig zelf bepalen hoe de lossfunctie eruitziet, welke optimalisatie-algoritmen we gebruiken en hoe we omgaan met gradiënten of constraints.
Wat is het doel?
In het artikel wordt beschreven hoe we SEM modellen schaalbaarder, flexibeler en uitbreidbaarder kunnen maken. Denk aan het eenvoudig toevoegen van regularisatie (zoals LASSO), robustere lossfuncties (zoals LAD), of combinaties met deep learning (zoals auto-encoders). Dit is vooral nuttig bij complexe of grootschalige modellen waarvoor traditionele SEM-software te rigide of traag is.
Bron: Van Kesteren, E.J., & Oberski, D.L. (2019). Structural Equation Modeling using Computation Graphs. doi:10.31234/osf.io/qn2jx
Zie hier het R package tensorsem waarbij gebruik wordt gemaakt van Torch
Bootrapped Vs. HOIJ standaardfouten
simulate_non_normal_data <- function(data,
n = 500,
skew_vals = NULL,
kurt_vals = NULL,
outlier_frac = 0,
outlier_sd = 5,
seed = NULL) {
if (!is.null(seed)) set.seed(seed)
observed_vars <- names(data)
p <- length(observed_vars)
# Resample uit bestaande data
dat <- data[sample(nrow(data), size = n, replace = TRUE), observed_vars]
# Voeg outliers toe indien gevraagd
if (outlier_frac > 0) {
n_out <- ceiling(n * outlier_frac)
outlier_idx <- sample(seq_len(n), size = n_out)
for (v in observed_vars) {
s <- sd(dat[[v]], na.rm = TRUE)
if (!is.finite(s) || s == 0) next
dat[outlier_idx, v] <- dat[outlier_idx, v] + rnorm(n_out, mean = 0, sd = outlier_sd * s)
}
}
return(as.data.frame(dat))
}
run_se_simulation <- function(model, PT,
nsim = 200, R = 200, N = 500,
skew_vals = NULL, kurt_vals = NULL,
outlier_frac = 0.05, outlier_sd = 2,
seed = NULL) {
set.seed(seed)
observed_vars <- lavNames(model, type = "ov")
D <- 21
theta_mat <- matrix(NA, nsim, D)
SE_boot_mat <- matrix(NA, nsim, D)
SE_hoij_mat <- matrix(NA, nsim, D)
for (m in 1:nsim) {
if (m %% 10 == 0) cat("Replicatie", m, "van", nsim, "\n")
# dat <- simulate_non_normal_data(
# PT = PT,
# n = N,
# skew_vals = skew_vals,
# kurt_vals = kurt_vals,
# outlier_frac = outlier_frac,
# outlier_sd = outlier_sd
# )
dat <- simulate_non_normal_data(
data = HolzingerSwineford1939,
n = N,
skew_vals = skew_vals,
kurt_vals = kurt_vals,
outlier_frac = outlier_frac,
outlier_sd = outlier_sd
)
fit <- try(sem(model, data = dat, se = "none"), silent = TRUE)
if (inherits(fit, "try-error")) next
theta0 <- coef(fit)
theta_mat[m, ] <- theta0
# Bootstrap
out_boot <- try(bootstrapLavaan(fit, R = R, keep.idx = TRUE, verbose = FALSE,
ncpus = 14, parallel = "snow"), silent = TRUE)
if (inherits(out_boot, "try-error")) next
SE_boot_mat[m, ] <- apply(out_boot, 2, sd)
# HOIJ
IDX1 <- attr(out_boot, "boot.idx")
Scores <- lavScores(fit, scaling = FALSE)
H.inv <- lavTech(fit, "inverted.information.observed")
COEF2 <- matrix(NA, R, D)
for (r in 1:R) {
freq <- numeric(N)
Table <- table(IDX1[[1]][r, ])
freq[as.integer(names(Table))] <- Table[]
wt <- freq / sum(freq)
delta.w <- wt - rep(1, N)
G <- colSums(Scores * delta.w)
G2 <- matrix(0, D, D)
for (j in 1:N) {
s <- Scores[j, , drop = FALSE]
G2 <- G2 + delta.w[j]^2 * (t(s) %*% s)
}
G2 <- G2 / N
theta_hoij <- drop(theta0 - H.inv %*% G + 0.5 * diag(H.inv %*% G2 %*% H.inv))
COEF2[r, ] <- theta_hoij
}
SE_hoij_mat[m, ] <- apply(COEF2, 2, sd)
}
# Empirische SE
true_SE <- apply(theta_mat, 2, sd, na.rm = TRUE)
# bias en RMSE
se_eval <- function(est_mat) {
se_est <- colMeans(est_mat, na.rm = TRUE)
bias <- se_est - true_SE
rmse <- sqrt(colMeans((est_mat - matrix(true_SE, nrow = nsim, ncol = D,
byrow = TRUE))^2, na.rm = TRUE))
data.frame(SE = se_est, Bias = bias, RMSE = rmse)
}
out <- list(
true_SE = true_SE,
bootstrap = se_eval(SE_boot_mat),
hoij = se_eval(SE_hoij_mat)
)
return(out)
}
HS.model <- ' visual =~ x1 + x2 + x3
textual =~ x4 + x5 + x6
speed =~ x7 + x8 + x9
visual ~ textual + speed '
fit <- sem(HS.model, data = HolzingerSwineford1939, se = "none")
PT <- parTable(fit)
#nrow(HolzingerSwineford1939)
result <- run_se_simulation(
model = HS.model,
PT = PT,
nsim = 200, R = 200,
N = 301,
skew_vals = rep(0, 9),
kurt_vals = rep(0, 9),
outlier_frac = 0,
outlier_sd = 2
)
# resultaten
RESULT <- round(data.frame(
true_SE = result$true_SE,
boot_SE = result$bootstrap$SE,
HOIJ_SE = result$hoij$SE,
boot_bias = result$bootstrap$Bias,
HOIJ_bias = result$hoij$Bias,
boot_RMSE = result$bootstrap$RMSE,
HOIJ_RMSE = result$hoij$RMSE
), 3)
rownames(RESULT) <- names(coef(fit))
RESULT
> RESULT
true_SE boot_SE HOIJ_SE boot_bias HOIJ_bias boot_RMSE HOIJ_RMSE
visual=~x2 0.144 0.153 0.146 0.009 0.002 0.051 0.054
visual=~x3 0.155 0.179 0.161 0.024 0.007 0.092 0.066
textual=~x5 0.066 0.067 0.066 0.001 -0.001 0.010 0.009
textual=~x6 0.063 0.063 0.061 0.000 -0.001 0.008 0.008
speed=~x8 0.159 0.176 0.151 0.017 -0.008 0.065 0.048
speed=~x9 0.398 0.480 0.381 0.083 -0.017 0.333 0.429
visual~textual 0.088 0.092 0.095 0.004 0.007 0.016 0.030
visual~speed 0.260 0.300 0.281 0.040 0.020 0.132 0.260
x1~~x1 0.178 0.183 0.179 0.005 0.001 0.069 0.060
x2~~x2 0.120 0.112 0.116 -0.008 -0.004 0.015 0.017
x3~~x3 0.100 0.109 0.106 0.010 0.007 0.033 0.029
x4~~x4 0.052 0.050 0.050 -0.003 -0.003 0.006 0.006
x5~~x5 0.057 0.056 0.056 0.000 0.000 0.007 0.007
x6~~x6 0.048 0.046 0.046 -0.002 -0.002 0.005 0.005
x7~~x7 0.101 0.100 0.105 -0.001 0.005 0.020 0.062
x8~~x8 0.125 0.125 0.128 0.000 0.003 0.039 0.096
x9~~x9 0.136 0.141 0.136 0.005 0.000 0.056 0.104
visual~~visual 0.155 0.160 0.153 0.005 -0.003 0.068 0.072
textual~~textual 0.123 0.120 0.122 -0.002 -0.001 0.013 0.013
speed~~speed 0.112 0.107 0.113 -0.006 0.000 0.022 0.064
textual~~speed 0.057 0.058 0.059 0.001 0.003 0.009 0.012Conclusie
HOIJ_SE is in de meeste gevallen nauwkeuriger dan boot_SE, zeker voor ladingen en regressie-effecten, doordat het lagere bias en RMSE vertoont.
Bootstrap doet het soms iets beter op residuele varianties, mogelijk doordat deze lastiger te schatten zijn met de gebruikte HOIJ-benadering. De variantie is bijvoorbeeld voor speed=~x9 veel groter voor HOIJ.
Dus hoewel de HOIJ unbiased is, vertoont deze wel grotere variatie per steekproef dan de boostrap.
SNLSS startwaarden (CFA: lambda + beta van het origineel)
library(lavaan)
library(data.table)
library(future.apply)Loading required package: future
source("code snippets/SNLLS/SNLLS_algorithm.R")
n_boot <- 1000
n_workers <- parallel::detectCores(logical = FALSE)
model_cfa <- '
visual =~ x1 + x2 + x3
textual =~ x4 + x5 + x6
speed =~ x7 + x8 + x9
visual ~ textual + speed
'
population.model <- ' visual =~ x1 + x2 + x3
textual =~ x4 + x5 + x6
speed =~ x7 + x8 + x9
visual ~ 0.2*textual + 0.5*speed
'
# generate data
set.seed(3013073)
simData <- simulateData(population.model, sample.nobs = 100L)
# Origineel fit om parameters op te halen
fit.cfa <- sem(model_cfa, data = simData, estimator = "GLS",
do.fit = TRUE, representation = "LISREL")
GLIST <- fit.cfa@Model@GLIST
lambda <- GLIST$lambda
beta <- GLIST$beta
pt_org <- parTable(fit.cfa)
lambda_idx <- which(pt_org$op == "=~")
beta_idx <- which(pt_org$op == "~")
# Set parallel optie
plan(multisession, workers = n_workers)
# Functie per bootstrap replicatie
run_bootstrap_iteration <- function(i) {
set.seed(42 + i)
boot_data <- simData[sample(nrow(simData), replace = TRUE), ]
res <- list()
# Default methode
fit.org <- sem(model = model_cfa, data = boot_data, estimator = "GLS",
do.fit = TRUE, representation = "RAM")
res[[1]] <- data.table(replicatie = i, method = "default",
iterations = fit.org@optim$iterations)
## SNLLS startwaarden
pt <- copy(pt_org)
# fix lambda en beta
pt$free[lambda_idx] <- 0
pt$free[beta_idx] <- 0
pt$ustart[lambda_idx] <- lav_matrix_vec(lambda)[lav_matrix_vec(lambda) > 0]
pt$ustart[beta_idx] <- lav_matrix_vec(beta)[lav_matrix_vec(beta) > 0]
pt$est <- NULL
pt$start <- NULL
pt$se <- NULL
non_free_idx <- setdiff(seq_len(nrow(pt)), c(lambda_idx, beta_idx))
pt$free[non_free_idx] <- seq_along(non_free_idx)
fit.snlls <- cfa(pt, data = boot_data, estimator = "GLS", do.fit = FALSE,
representation = "RAM")
lavmodel <- fit.snlls@Model
S <- fit.snlls@SampleStats@cov[[1]]
S.inv <- fit.snlls@SampleStats@icov[[1]]
A <- lavmodel@GLIST$A
Fo <- diag(nrow(A))[lavmodel@GLIST$ov.idx, , drop = FALSE]
IA.inv <- lavaan:::lav_matrix_inverse_iminus(A)
FIA.inv <- Fo %*% IA.inv
V <- 0.5 * lavaan:::lav_matrix_duplication_pre_post(S.inv %x% S.inv)
s <- lavaan:::lav_matrix_vech(S)
Vs <- V %*% s
L.S <- lavaan:::lav_model_dmmdpar(lavmodel, target = "S")
G <- lavaan:::lav_matrix_duplication_ginv_pre(FIA.inv %x% FIA.inv) %*% L.S
out <- solve(crossprod(G, V) %*% G, crossprod(G, Vs))
# first elements belong to PSI
S <- matrix(L.S %*% out, ncol(A), ncol(A))
# extract variantie en covariantie
pt$ustart[pt$free > 0] <- c(diag(S), S[12, 11])
pt$free <- pt_org$free
#pt$ustart[pt$ustart < 0] <- 3e-01
fit.final <- sem(model = pt, data = boot_data, start = pt,
estimator = "GLS", do.fit = TRUE, representation = "LISREL")
res[[2]] <- data.table(replicatie = i, method = "snlls",
iterations = fit.final@optim$iterations)
rbindlist(res, use.names = TRUE, fill = TRUE)
}
# run
set.seed(42)
results_list <- future_lapply(1:n_boot, run_bootstrap_iteration, future.seed = TRUE)
results <- rbindlist(results_list)
results replicatie method iterations
<int> <char> <int>
1: 1 default 35
2: 1 snlls 32
3: 2 default 35
4: 2 snlls 31
5: 3 default 35
---
1996: 998 snlls 40
1997: 999 default 37
1998: 999 snlls 30
1999: 1000 default 35
2000: 1000 snlls 28
# =======
library(ggplot2)
ggplot(results, aes(x = method, y = iterations)) +
geom_boxplot(fill = "steelblue") +
labs(title = "Aantal iteraties voor convergentie",
y = "Aantal iteraties", x = "Startwaarden methode") +
theme_minimal()summary_stats <- results[, .(
mean_iter = mean(iterations, na.rm = TRUE),
sd_iter = sd(iterations, na.rm = TRUE),
median_iter = median(iterations, na.rm = TRUE),
n = .N
), by = method]
summary_stats method mean_iter sd_iter median_iter n
<char> <num> <num> <num> <int>
1: default 37.901 8.279357 37 1000
2: snlls 42.225 138.305317 33 1000
Conclusie
- Het fixeren van de lambda’s en de beta’s op het optimum van de originele data is wat veel van het goede om goede VCOV startwaarden te genereren voor de bootstrap samples.
Guttman lambda’s + SNLLS VCOV startwaarden - CFA
library(lavaan)
library(data.table)
library(future.apply)
n_boot <- 1000
n_workers <- parallel::detectCores(logical = FALSE)
model_cfa <- '
visual =~ x1 + x2 + x3
textual =~ x4 + x5 + x6
speed =~ x7 + x8 + x9
'
set.seed(3013073)
sim_data <- simulateData(model = model_cfa, sample.nobs = 500, model.type = "cfa")
# Set parallel optie
plan(multisession, workers = n_workers)
# Functie per bootstrap replicatie
run_bootstrap_iteration <- function(i) {
set.seed(42 + i)
# boot_data <- HolzingerSwineford1939[sample(nrow(HolzingerSwineford1939), replace = TRUE), ]
boot_data <- sim_data[sample(nrow(sim_data), replace = TRUE), ]
res <- list()
# Default methode
fit.cfa <- sem(model_cfa, data = boot_data, estimator = "gls",
do.fit = TRUE, representation = "LISREL"#,
#control = list(optim.method = "NLMINB", rel.tol = 1e-6)
)
res[[1]] <- data.table(replicatie = i, method = "default",
iterations = fit.cfa@optim$iterations)
GLIST <- fit.cfa@Model@GLIST
pt.org <- partable(fit.cfa)
lambda <- GLIST$lambda
lambda_idx <- which(pt.org$op == "=~")
guttman_est <- lavaan:::lav_cfa_guttman1952_internal(fit.cfa)
pt <- copy(pt.org)
pt$start <- NULL
pt$est <- NULL
pt$se <- NULL
na_idx <- which(is.na(pt$ustart[lambda_idx]))
pt$ustart[lambda_idx][na_idx] <- guttman_est[1:6]
# make them non-free est
pt$free[lambda_idx] <- 0L
## SNLLS startwaarden
fit.snlls <- cfa(pt, data = boot_data, estimator = "gls", do.fit = FALSE,
representation = "RAM")
lavmodel <- fit.snlls@Model
S <- fit.snlls@SampleStats@cov[[1]]
S.inv <- fit.snlls@SampleStats@icov[[1]]
A <- lavmodel@GLIST$A
Fo <- diag(nrow(A))[lavmodel@GLIST$ov.idx, , drop = FALSE]
IA.inv <- lavaan:::lav_matrix_inverse_iminus(A)
FIA.inv <- Fo %*% IA.inv
V <- 0.5 * lavaan:::lav_matrix_duplication_pre_post(S.inv %x% S.inv)
s <- lavaan:::lav_matrix_vech(S)
Vs <- V %*% s
L.S <- lavaan:::lav_model_dmmdpar(lavmodel, target = "S")
G <- lavaan:::lav_matrix_duplication_ginv_pre(FIA.inv %x% FIA.inv) %*% L.S
out <- solve(crossprod(G, V) %*% G, crossprod(G, Vs))
S <- matrix(L.S %*% out, ncol(A), ncol(A))
diag_vals <- diag(S)
upper_vals <- S[upper.tri(S)]
upper_vals <- upper_vals[abs(upper_vals) > 0]
out <- c(diag_vals, upper_vals)
pt$ustart[pt$free > 0] <- out
pt$free <- pt.org$free
fit.guttman_snlls <- sem(model = pt, data = boot_data,
estimator = "gls", do.fit = TRUE, representation = "LISREL"#,
#control = list(optim.method = "NLMINB", rel.tol = 1e-6)
)
fit.guttman_snlls@optim
#partable(fit.guttman_snlls)
# Hier gaan we verder met ML
res[[2]] <- data.table(replicatie = i, method = "guttman_snlls",
iterations = fit.guttman_snlls@optim$iterations)
rbindlist(res, use.names = TRUE, fill = TRUE)
}
# run
set.seed(42)
results_list <- future_lapply(1:n_boot, run_bootstrap_iteration, future.seed = TRUE)
results <- rbindlist(results_list)
# =======
# library(ggplot2)
# ggplot(results, aes(x = method, y = iterations)) +
# geom_boxplot(fill = "steelblue") +
# labs(title = "Aantal iteraties voor convergentie",
# y = "Aantal iteraties", x = "Startwaarden methode") +
# theme_minimal()
summary_stats <- results[, .(
mean_iter = mean(iterations, na.rm = TRUE),
sd_iter = sd(iterations, na.rm = TRUE),
median_iter = median(iterations, na.rm = TRUE),
n = .N
), by = method]
summary_stats method mean_iter sd_iter median_iter n
<char> <num> <num> <num> <int>
1: default 33.236 1.977932 33 1000
2: guttman_snlls 26.278 2.103602 26 1000
SNLLS starting values for growth model (lambda’s worden gefixeerd door de user)
library(lavaan)
library(data.table)
library(future.apply)
n_boot <- 1000
n_workers <- parallel::detectCores(logical = FALSE)
model_growth <- '
# intercept and slope with fixed coefficients
i =~ 1*t1 + 1*t2 + 1*t3 + 1*t4
s =~ 0*t1 + 1*t2 + 2*t3 + 3*t4
'
#fit <- growth(model_growth, data = Demo.growth)
#partable(fit)
# Set parallel optie
plan(multisession, workers = n_workers)
# Functie per bootstrap replicatie
run_bootstrap_iteration <- function(i) {
set.seed(42 + i)
boot_data <- Demo.growth[sample(nrow(Demo.growth), replace = TRUE), ]
res <- list()
# Default methode
fit.growth <- growth(model_growth, data = boot_data, estimator = "gls",
do.fit = TRUE, representation = "RAM")
res[[1]] <- data.table(replicatie = i, method = "default",
iterations = fit.growth@optim$iterations)
#GLIST <- fit.growth@Model@GLIST
pt.org <- partable(fit.growth)
pt <- copy(pt.org)
pt$start <- NULL
pt$est <- NULL
pt$se <- NULL
## SNLLS startwaarden
fit.snlls <- growth(pt, data = boot_data, estimator = "gls", do.fit = FALSE,
representation = "RAM")
lavmodel <- fit.snlls@Model
S <- fit.snlls@SampleStats@cov[[1]]
S.inv <- fit.snlls@SampleStats@icov[[1]]
A <- lavmodel@GLIST$A
Fo <- diag(nrow(A))[lavmodel@GLIST$ov.idx, , drop = FALSE]
IA.inv <- lavaan:::lav_matrix_inverse_iminus(A)
FIA.inv <- Fo %*% IA.inv
V <- 0.5 * lavaan:::lav_matrix_duplication_pre_post(S.inv %x% S.inv)
s <- lavaan:::lav_matrix_vech(S)
Vs <- V %*% s
L.S <- lavaan:::lav_model_dmmdpar(lavmodel, target = "S")
G <- lavaan:::lav_matrix_duplication_ginv_pre(FIA.inv %x% FIA.inv) %*% L.S
out <- solve(crossprod(G, V) %*% G, crossprod(G, Vs))
S <- matrix(L.S %*% out, ncol(A), ncol(A))
diag_vals <- diag(S)
upper_vals <- S[upper.tri(S)]
upper_vals <- upper_vals[abs(upper_vals) > 0]
out <- c(diag_vals, upper_vals)
pt$ustart[pt$free %in% 1:7] <- out
# of toch verder met ml
fit.guttman_snlls <- growth(model = pt, data = boot_data,
estimator = "gls", do.fit = TRUE, representation = "LISREL")
#partable(fit.guttman_snlls)
# Hier gaan we verder met ML
res[[2]] <- data.table(replicatie = i, method = "snlls",
iterations = fit.guttman_snlls@optim$iterations)
rbindlist(res, use.names = TRUE, fill = TRUE)
}
# run
set.seed(42)
results_list <- future_lapply(1:n_boot, run_bootstrap_iteration, future.seed = TRUE)
results <- rbindlist(results_list)
# =======
summary_stats <- results[, .(
mean_iter = mean(iterations, na.rm = TRUE),
sd_iter = sd(iterations, na.rm = TRUE),
median_iter = median(iterations, na.rm = TRUE),
n = .N
), by = method]
summary_stats method mean_iter sd_iter median_iter n
<char> <num> <num> <num> <int>
1: default 19.218 1.0562157 19 1000
2: snlls 7.614 0.9142428 8 1000
# startwaarden obv gls, final model obv ml
# > summary_stats
# method mean_iter sd_iter median_iter n
# <char> <num> <num> <num> <int>
# 1: default 28.753 1.8802996 29 1000
# 2: snlls 20.557 0.9260716 21 1000Conclusie
- De gefixeerde lambda’s zijn gebruikt om via de SNLSS methode de covarianties te schatten.
- De slope mean en het intercept worden niet meegenomen bij het bepalen van de startwaarden en moeten nog zonder goede startwaarden geschat worden.
Vergelijking: SNLLS versus Demidenko’s closed-form oplossing
Separable Nonlinear Least Squares (SNLLS) is een schattingsmethode voor modellen waarin sommige parameters lineair en andere niet-lineair voorkomen.
Voor bepaalde balanced mixed models (zoals growth curve modellen) met vaste designmatrix (\(\Lambda\)), homoscedastische fouten, en lineaire structuur kun je de closed-form oplossing van Demidenko gebruiken.
Beide methoden leveren exact dezelfde schattingen wanneer:
- het model een balanced random coefficient model is
- (\(\Lambda\)) is vast en lineair
- meetfouten zijn homoscedastisch
- geen constraints op (\(\Psi\)).
In dat geval kan SNLLS worden uitgevoerd zonder iteraties, en komt overeen met de closed-form ML-oplossing.
SNLLS icm een eenvoudig growth curve model (met fixed lambda’s)
library(lavaan)
library(data.table)
library(future.apply)
n_boot <- 1000
n_workers <- parallel::detectCores(logical = FALSE)
model_growth <- '
# intercept and slope with fixed coefficients
i =~ 1*t1 + 1*t2 + 1*t3 + 1*t4
s =~ 0*t1 + 1*t2 + 2*t3 + 3*t4
'
# create selection matrix (assuming a single group)
xm2select <- function(fit, target = "theta") {
lavmodel <- fit@Model
target.idx <- which(names(lavmodel@GLIST) == target)[1] # first group only
# dimension
P <- nrow(lavmodel@GLIST[[target.idx]])
unique.idx <- unique(lavmodel@x.free.idx[[target.idx]])
row.idx <- match(lavmodel@x.free.idx[[target.idx]], unique.idx)
out <- matrix(0, nrow = P*P, ncol = length(unique.idx))
IDX <- cbind(fit@Model@m.free.idx[[target.idx]], row.idx)
out[IDX] <- 1
out
}
#fit <- growth(model_growth, data = Demo.growth)
#partable(fit)
# Set parallel optie
plan(multisession, workers = n_workers)
# Functie per bootstrap replicatie
run_bootstrap_iteration <- function(i) {
set.seed(42 + i)
boot_data <- Demo.growth[sample(nrow(Demo.growth), replace = TRUE), ]
res <- list()
# Default methode
fit.growth <- growth(model_growth, data = boot_data, estimator = "gls",
do.fit = TRUE, representation = "LISREL")
res[[1]] <- data.table(replicatie = i, method = "default",
iterations = fit.growth@optim$iterations)
#GLIST <- fit.growth@Model@GLIST
pt.org <- partable(fit.growth)
pt <- copy(pt.org)
pt$start <- NULL
pt$est <- NULL
pt$se <- NULL
## SNLLS startwaarden
fit.snlls <- growth(pt, data = boot_data, estimator = "gls", do.fit = FALSE,
representation = "RAM")
lavmodel <- fit.snlls@Model
S <- fit.snlls@SampleStats@cov[[1]]
S.inv <- fit.snlls@SampleStats@icov[[1]]
if (fit.snlls@Model@representation == "RAM") {
A <- lavmodel@GLIST$A
Fo <- diag(nrow(A))[lavmodel@GLIST$ov.idx, , drop = FALSE]
IA.inv <- lavaan:::lav_matrix_inverse_iminus(A)
FIA.inv <- Fo %*% IA.inv
V <- 0.5 * lavaan:::lav_matrix_duplication_pre_post(S.inv %x% S.inv)
s <- lavaan:::lav_matrix_vech(S)
Vs <- V %*% s
L.S <- lavaan:::lav_model_dmmdpar(lavmodel, target = "S")
G <- lavaan:::lav_matrix_duplication_ginv_pre(FIA.inv %x% FIA.inv) %*% L.S
out <- solve(crossprod(G, V) %*% G, crossprod(G, Vs))
S <- matrix(L.S %*% out, ncol(A), ncol(A))
out <- solve(crossprod(G, V) %*% G, crossprod(G, Vs))
MEAN <- fit.snlls@SampleStats@mean[[1]]
nmeans <- ncol(lavTech(fit.snlls, "est")$ov.idx)
nu <- rep(0, nmeans)
LAMBDA <- FIA.inv[, (nmeans+1):ncol(FIA.inv)]
NU <- solve(t(LAMBDA) %*% S.inv %*% LAMBDA) %*% t(LAMBDA) %*% S.inv %*% (MEAN - nu)
diag_vals <- diag(S)
upper_vals <- S[upper.tri(S)]
upper_vals <- upper_vals[abs(upper_vals) > 0]
out <- c(diag_vals, upper_vals)
pt$ustart[pt$free %in% 1:7] <- out
pt$ustart[pt$free %in% 8:9] <- NU
} else {
LAMBDA <- lavmodel@GLIST$lambda
# s and V
s.mean <- as.matrix(lavTech(fit.snlls, "wls.obs")[[1]])
V.mean <- lavInspect(fit.snlls, "WLS.V")
# remove means (for now)
s <- s.mean[-c(1,2,3,4)]
V <- V.mean[-c(1,2,3,4), -c(1,2,3,4)]
Vs <- V %*% s
# L.PSI, L.THETA and G2
L.PSI <- xm2select(fit.snlls, target = "psi")
L.THETA <- xm2select(fit.snlls, target = "theta")
G2 <- lav_matrix_duplication_ginv_pre(L.THETA)
# given lambda, solve for the remaining parameters using WLS
G1 <- lav_matrix_duplication_ginv_pre(LAMBDA %x% LAMBDA) %*% L.PSI
G <- cbind(G1, G2)
out <- solve(crossprod(G, V) %*% G, crossprod(G, Vs))
# first 3 are elements of PSI
# variances and covariances (i~~i, s~~s, i~~s)
PSI <- matrix(L.PSI %*% out[1:ncol(L.PSI)], ncol(LAMBDA), ncol(LAMBDA))
# next 4 elements are the diagonal elements of THETA
# residual variances (t1~~t1, etc)
THETA <- matrix(L.THETA %*% out[(1+ncol(L.PSI)):length(out)], nrow(LAMBDA), nrow(LAMBDA))
MEAN <- fit.snlls@SampleStats@mean[[1]]
if (fit.snlls@Model@estimator == "GLS") {
# for GLS, we need S.inv as a weight matrix:
NU <- solve(t(LAMBDA) %*% S.inv %*% LAMBDA) %*% t(LAMBDA) %*% S.inv %*% (MEAN - lavTech(fit.snlls, "est")$nu)
} else if (fit.snlls@Model@estimator == "ML") {
NU <- solve(crossprod(LAMBDA), t(LAMBDA)) %*% (MEAN - lavTech(fit.snlls, "est")$nu)
}
# Dit kan vast handiger.
pt$ustart[pt$free %in% 1:4] <- diag(THETA)
pt$ustart[pt$free %in% 5:7] <- c(diag(PSI), PSI[lower.tri(PSI, diag = FALSE)])
pt$ustart[pt$free %in% 8:9] <- NU
}
pt$free <- pt.org$free
# of toch verder met ml
fit.guttman_snlls <- growth(model = pt, data = boot_data,
estimator = "gls", do.fit = TRUE,
representation = "LISREL")
#partable(fit.guttman_snlls)
res[[2]] <- data.table(replicatie = i, method = "snlls",
iterations = fit.guttman_snlls@optim$iterations)
rbindlist(res, use.names = TRUE, fill = TRUE)
}
# run
set.seed(42)
results_list <- future_lapply(1:n_boot, run_bootstrap_iteration, future.seed = TRUE)
results <- rbindlist(results_list)
# =======
summary_stats <- results[, .(
mean_iter = mean(iterations, na.rm = TRUE),
sd_iter = sd(iterations, na.rm = TRUE),
median_iter = median(iterations, na.rm = TRUE),
n = .N
), by = method]
summary_stats method mean_iter sd_iter median_iter n
<char> <num> <num> <num> <int>
1: default 19.218 1.0562157 19 1000
2: snlls 1.695 0.8710532 1 1000
Non-iterative ML schattingen voor growth curve model (let op assumpties)
model <- '
# intercept and slope with fixed coefficients
i =~ 1*t1 + 1*t2 + 1*t3 + 1*t4
s =~ 0*t1 + 1*t2 + 2*t3 + 3*t4
# zero intercepts
t1 + t2 + t3 + t4 ~ 0*1
# free latent means
i + s ~ 1
# single var.e
t1 ~~ var.e*t1
t2 ~~ var.e*t2
t3 ~~ var.e*t3
t4 ~~ var.e*t4
'
fit <- cfa(model, data = Demo.growth, estimator = "ml",
do.fit = TRUE, representation = "LISREL")
mle_growth_closedform <- function(Y, Lambda) {
N <- nrow(Y)
p <- ncol(Y)
q <- ncol(Lambda)
MEAN <- colMeans(Y)
beta_ml <- solve(crossprod(Lambda), t(Lambda)) %*% MEAN
# Residuals and EE matrix
E <- t(Y) - Lambda %*% beta_ml %*% rep(1, N)
EE <- tcrossprod(E)
Sigma_hat <- EE / N
# Residual variance
LtL_inv <- solve(crossprod(Lambda))
P.L <- Lambda %*% LtL_inv %*% t(Lambda)
IL <- diag(p) - P.L
s2 <- sum((IL %*% E)^2) / (N * (p - q))
Theta_hat <- diag(rep(s2, p))
# Projection matrix M
Ti <- diag(rep(s2, p))
M <- solve(t(Lambda) %*% Ti %*% Lambda) %*% t(Lambda) %*% Ti
Psi_hat <- M %*% ((EE)/N - Ti) %*% t(M)
list(
beta = as.vector(beta_ml),
theta = s2,
Theta = Theta_hat,
Psi = Psi_hat,
Sigma = Lambda %*% Psi_hat %*% t(Lambda) + Theta_hat
)
}
( est <- lavTech(fit, "est") )$lambda
[,1] [,2]
[1,] 1 0
[2,] 1 1
[3,] 1 2
[4,] 1 3
$theta
[,1] [,2] [,3] [,4]
[1,] 0.6223442 0.0000000 0.0000000 0.0000000
[2,] 0.0000000 0.6223442 0.0000000 0.0000000
[3,] 0.0000000 0.0000000 0.6223442 0.0000000
[4,] 0.0000000 0.0000000 0.0000000 0.6223442
$psi
[,1] [,2]
[1,] 1.9279467 0.6266953
[2,] 0.6266953 0.5764599
$nu
[,1]
[1,] 0
[2,] 0
[3,] 0
[4,] 0
$alpha
[,1]
[1,] 0.6171569
[2,] 1.0051929
Y <- fit@Data@X[[1]]
LAMBDA <- est$lambda
mle_growth_closedform(Y, LAMBDA)$beta
[1] 0.6171568 1.0051928
$theta
[1] 0.6223442
$Theta
[,1] [,2] [,3] [,4]
[1,] 0.6223442 0.0000000 0.0000000 0.0000000
[2,] 0.0000000 0.6223442 0.0000000 0.0000000
[3,] 0.0000000 0.0000000 0.6223442 0.0000000
[4,] 0.0000000 0.0000000 0.0000000 0.6223442
$Psi
[,1] [,2]
[1,] 1.9279466 0.6266952
[2,] 0.6266952 0.5764596
$Sigma
[,1] [,2] [,3] [,4]
[1,] 2.550291 2.554642 3.181337 3.808032
[2,] 2.554642 4.380141 4.960951 6.164106
[3,] 3.181337 4.960951 7.362910 8.520180
[4,] 3.808032 6.164106 8.520180 11.498599
Assumpties
- Elke persoon moet op exact dezelfde tijdstippen gemeten zijn (bv. op leeftijd 1, 2, 3, 4).
- Er mogen geen missings zijn.
- De factorladingen staan vast.
- Alle meetmomenten hebben dezelfde meetfoutvariantie.
- Meetfouten zijn niet met elkaar gecorreleerd.
- Geen voorspellers in het model.
- De data is MVN.
Auto-encoder CFA met structural part - Unsupervised
library(torch)
library(MASS)
library(vegan) Loading required package: permute
Loading required package: lattice
set.seed(42)
n_factors <- 3
gen_data <- function(n = 1000L, theta = 0.2) {
# genereer ware factoren (orthogonaal)
eta_true <- matrix(NA, nrow = n, ncol = n_factors)
colnames(eta_true) <- paste0("f", 1:n_factors)
# f2 en f3 standaardnormaal
#eta_true[, 2:3] <- matrix(rnorm(n * 2), ncol = 2)
Sigma <- matrix(c(1, 0.3, 0.3, 1), 2, 2)
eta_true[, 2:3] <- MASS::mvrnorm(n, mu = c(0, 0), Sigma = Sigma)
f2 <- eta_true[, 2]
f3 <- eta_true[, 3]
# f1 = lineaire combinatie + interactie van f2 en f3
b2 <- 0.5; b3 <- -0.4
f1 <- b2*f2 + b3*f3 + rnorm(n, 0, sqrt(1.8))
eta_true[, 1] <- f1
# meetfout instellen
lambda <- sqrt(1 - theta)
resid_sd <- sqrt(theta)
generate_items <- function(latent, resid_sd, lambda) {
error <- matrix(rnorm(n * 3, sd = resid_sd), nrow = n)
matrix(lambda * rep(latent, times = 3), nrow = n) + error
}
# items genereren
x <- generate_items(eta_true[, 1], resid_sd, lambda)
colnames(x) <- paste0("x", 1:3)
y <- generate_items(eta_true[, 2], resid_sd, lambda)
colnames(y) <- paste0("y", 1:3)
z <- generate_items(eta_true[, 3], resid_sd, lambda)
colnames(z) <- paste0("z", 1:3)
X <- data.frame(x, y, z)
list(X=X, eta_true=eta_true)
}
out <- gen_data()
X <- out$X
eta_true <- out$eta_true
X_scaled <- scale(X)
n_items <- ncol(X_scaled)
eta_tensor <- torch_tensor(eta_true, dtype = torch_float())
# -------------------------------------------------------------------------
x_tensor <- torch_tensor(as.matrix(X_scaled), dtype = torch_float())
x_noisy <- x_tensor + torch_randn_like(x_tensor) * 0.2 # voeg wat extra noise toe,
# om het signaal nog beter van de noise te kunnen onderscheiden
autoencoder_nn <- nn_module(
"StructuredAutoEncoder",
initialize = function(input_dim, latent_dim, hidden = 128, dropout_p = 0) {
self$encoders <- nn_module_list(lapply(1:3, function(i) {
nn_sequential(
nn_linear(3, hidden), # van 3 items naar hidden laag
nn_gelu(), # activatiefunctie
nn_dropout(dropout_p), # regularisatie (wordt nu niet gebruikt)
nn_linear(hidden, 1) # naar 1 latente factor (f1, f2, f3)
)
}))
self$decoders <- nn_module_list(lapply(1:3, function(i) {
nn_sequential(
nn_linear(1, hidden),
nn_gelu(),
nn_dropout(dropout_p),
nn_linear(hidden, 3)
)
}))
# f1_predictor: leert hoe f1 ongeveer voorspeld wordt uit f2 en f3
self$f1_predictor <- nn_sequential(
nn_linear(2, hidden),
nn_gelu(),
nn_linear(hidden, 1)
)
# Dit zorgt ervoor dat f1 wordt afgeleid uit andere factoren i.p.v. direct uit data
self$f1_from_f2f3 <- nn_sequential(
nn_linear(2, hidden),
nn_gelu(),
nn_linear(hidden, 1)
)
},
forward = function(x) {
x1 <- x[, 1:3] # welke items horen bij welke factor
x2 <- x[, 4:6]
x3 <- x[, 7:9]
f2 <- self$encoders[[2]](x2)$squeeze(2)
f3 <- self$encoders[[3]](x3)$squeeze(2)
# f1 wordt niet direct uit x1 gehaald, maar afgeleid van f2 en f3
f1 <- self$f1_from_f2f3(torch_stack(list(f2, f3), dim = 2))$squeeze(2)
# f1_hat is voorspelde f1 voor het berekenen van de structurele loss
f1_hat <- self$f1_predictor(torch_stack(list(f2, f3), dim = 2))$squeeze(2)
# combineer alles tot een matrix
z <- torch_stack(list(f1, f2, f3), dim = 2)
# reconstructie
r1 <- self$decoders[[1]](f1$unsqueeze(2))
r2 <- self$decoders[[2]](f2$unsqueeze(2))
r3 <- self$decoders[[3]](f3$unsqueeze(2))
x_recon <- torch_cat(list(r1, r2, r3), dim = 2)
return(list(z = z, x_recon = x_recon, f1_hat = f1_hat))
}
)
loss_ae <- function(x_true, output, lambda_struct = 1e-2) {
x_recon <- output$x_recon
z <- output$z
f1_hat <- output$f1_hat$unsqueeze(2)
f1 <- z[, 1, drop = FALSE]
# hoe goed worden de originele items gereconstrueerd?
recon_loss <- nnf_mse_loss(x_recon, x_true)
# Dit helpt het model om f1 niet los, maar afhankelijk van de andere factoren te leren
struct_loss <- nnf_mse_loss(f1_hat, f1)
total <- recon_loss + lambda_struct * struct_loss
return(list(total = total, recon = recon_loss, struct = struct_loss))
}
# Model trainen
model <- autoencoder_nn(input_dim = 9, latent_dim = 3)
optimizer <- optim_adam(model$parameters, lr = 0.001, weight_decay = 1e-4)
early_stopping <- 10
min_delta <- 1e-6
best_loss <- Inf
counter <- 0
for (epoch in 1:2000) {
model$train()
optimizer$zero_grad()
output <- model(x_noisy)
loss_list <- loss_ae(x_tensor, output)
loss_list$total$backward()
optimizer$step()
current_loss <- loss_list$total$item()
if (current_loss < best_loss - min_delta) {
best_loss <- current_loss
counter <- 0
} else {
counter <- counter + 1
if (counter >= early_stopping) {
cat("early stopping triggered at epoch", epoch, "with loss:", round(best_loss, 4), "\n")
break
}
}
if (epoch %% 100 == 0) {
cat("Epoch", epoch,
"Recon:", round(loss_list$recon$item(), 4),
"\n")
}
}Epoch 100 Recon: 0.3993
Epoch 200 Recon: 0.3976
Epoch 300 Recon: 0.3967
Epoch 400 Recon: 0.3955
Epoch 500 Recon: 0.3943
Epoch 600 Recon: 0.393
Epoch 700 Recon: 0.3908
Epoch 800 Recon: 0.3885
Epoch 900 Recon: 0.3875
Epoch 1000 Recon: 0.3867
Epoch 1100 Recon: 0.3857
Epoch 1200 Recon: 0.3849
Epoch 1300 Recon: 0.3843
Epoch 1400 Recon: 0.3837
Epoch 1500 Recon: 0.3832
Epoch 1600 Recon: 0.3827
early stopping triggered at epoch 1691 with loss: 0.3822
# -------------------------------------------------------------------------
z_hat <- as_array(output$z)
colnames(z_hat) <- paste0("f", 1:n_factors)
round(cor(z_hat, eta_true), 3) f1 f2 f3
f1 -0.417 -0.530 0.371
f2 -0.247 -0.945 -0.290
f3 -0.163 0.291 0.945
# gelijke sprijding?
apply(z_hat, 2, sd) f1 f2 f3
0.1203229 1.7440733 1.7638427
proc <- procrustes(X = eta_true, Y = z_hat, scale = TRUE)
summary(proc)
Call:
procrustes(X = eta_true, Y = z_hat, scale = TRUE)
Number of objects: 1000 Number of dimensions: 3
Procrustes sum of squares:
2140.72
Procrustes root mean squared error:
1.46312
Quantiles of Procrustes errors:
Min 1Q Median 3Q Max
0.01553668 0.64000213 1.02168243 1.65550602 4.45794262
Rotation matrix:
[,1] [,2] [,3]
[1,] -0.8720574 0.38391289 -0.30352387
[2,] -0.3856182 -0.92090331 -0.05688335
[3,] -0.3013544 0.06743878 0.95112436
Translation of averages:
[,1] [,2] [,3]
[1,] 0.00655485 0.08592021 0.06246092
Scaling of target:
[1] 0.5771842
z_rot <- proc$Yrot
df_true <- as.data.frame(eta_true)
colnames(df_true) <- paste0("f", 1:n_factors)
df_rot <- as.data.frame(z_rot)
colnames(df_rot) <- paste0("f", 1:n_factors)
form <- as.formula(f1 ~ f2 + f3)
## De AE kan ook andere structuren leren zolang de reconstructie klopt.
## Vandaar dat de correlatie tussen eta_true en z_hat zo hoog is, maar de
## regressiecoefs afwijken
# truth: b2 <- 0.5; b3 <- -0.4
summary(lm(form, data = df_rot))
Call:
lm(formula = form, data = df_rot)
Residuals:
Min 1Q Median 3Q Max
-0.206236 -0.032189 -0.000457 0.031088 0.244924
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 7.524e-17 1.517e-03 0.0 1
f2 5.076e-01 1.786e-03 284.2 <2e-16 ***
f3 -4.031e-01 1.706e-03 -236.2 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.04797 on 997 degrees of freedom
Multiple R-squared: 0.9899, Adjusted R-squared: 0.9899
F-statistic: 4.885e+04 on 2 and 997 DF, p-value: < 2.2e-16
Conclusie
- CFA model met 3 gecorreleerde factoren
- De lagere correlatie voor f1, komt door de veel lagere sd in z_hat wat zorgt voor een kunstmatig lage correlatie: cor(z, \(\eta\)) = cov(z, \(\eta\)) / sd(z) \(\times\) sd(\(\eta\))
- De AE bepaalt de factor scores goed, mits het criterium is dat de regressiecoëfficienten correct wordt gereconstrueerd.
Auto-encoder CFA zonder structural part - Unsupervised
library(torch)
library(MASS)
library(vegan)
library(lavaan)
library(future.apply)
# ------------------------------------------------------------------------------
n_factors <- 3
n_items <- 5
gen_data <- function(n = 1000L, theta = 0.5) {
eta_true <- matrix(NA, nrow = n, ncol = n_factors)
colnames(eta_true) <- paste0("f", 1:n_factors)
Sigma <- matrix(0.3, nrow = n_factors, ncol = n_factors)
diag(Sigma) <- 1
eta_true[, 1:3] <- MASS::mvrnorm(n, mu = rep(0, n_factors), Sigma = Sigma)
generate_items <- function(latent, resid_sd, lambda) {
error <- matrix(rnorm(n * n_items, sd = resid_sd), nrow = n)
matrix(lambda * rep(latent, times = n_items), nrow = n) + error
}
lambda <- sqrt(1 - theta)
resid_sd <- sqrt(theta)
x <- generate_items(eta_true[, 1], resid_sd, lambda)
y <- generate_items(eta_true[, 2], resid_sd, lambda)
z <- generate_items(eta_true[, 3], resid_sd, lambda)
colnames(x) <- paste0("x", 1:n_items)
colnames(y) <- paste0("y", 1:n_items)
colnames(z) <- paste0("z", 1:n_items)
X <- data.frame(x, y, z)
list(X = X, eta_true = eta_true)
}
# -------------------------------------------------------------------------
generate_measurement_model <- function(n_factors = 3, n_items = 5) {
factor_names <- paste0("f", 1:n_factors)
item_prefixes <- letters[24:(24 + n_factors - 1)] # x, y, z, ...
measurement_parts <- mapply(function(f, p) {
items <- paste0(p, 1:n_items)
paste(f, "=~", paste(items, collapse = " + "))
}, factor_names, item_prefixes)
model_string <- paste(measurement_parts, collapse = "\n")
return(model_string)
}
# ------------------------------------------------------------------------------
autoencoder_nn <- nn_module(
"StructuredAutoEncoder",
initialize = function(n_factors, n_items, hidden = 128, dropout_p = 0.05) {
self$n_factors <- n_factors
self$n_items <- n_items
self$total_items <- n_factors * n_items
self$encoders <- nn_module_list(lapply(1:n_factors, function(i) {
nn_sequential(
nn_linear(n_items, hidden),
nn_batch_norm1d(hidden),
nn_gelu(),
nn_linear(hidden, hidden),
nn_batch_norm1d(hidden),
nn_gelu(),
nn_dropout(dropout_p),
nn_linear(hidden, 1)
)
}))
self$decoders <- nn_module_list(lapply(1:n_factors, function(i) {
nn_sequential(
nn_linear(1, hidden),
nn_gelu(),
#nn_linear(hidden, hidden),
#nn_gelu(),
nn_dropout(dropout_p),
nn_linear(hidden, n_items)
)
}))
},
forward = function(x) {
factor_inputs <- list()
start_idx <- 1
for (i in 1:self$n_factors) {
end_idx <- start_idx + self$n_items - 1
x_block <- x[, start_idx:end_idx]
factor_inputs[[i]] <- x_block
start_idx <- end_idx + 1
}
z_list <- lapply(1:self$n_factors, function(i) {
self$encoders[[i]](factor_inputs[[i]])$squeeze(2)
})
z_raw <- torch_stack(z_list, dim = 2)
# Normaliseer per factor
z <- (z_raw - z_raw$mean(dim = 1, keepdim = TRUE)) / (z_raw$std(dim = 1, keepdim = TRUE) + 1e-6)
r_list <- lapply(1:self$n_factors, function(i) {
self$decoders[[i]](z_list[[i]]$unsqueeze(2))
})
x_recon <- torch_cat(r_list, dim = 2)
return(list(z = z, z_raw = z_raw, x_recon = x_recon))
}
)
loss_ae <- function(x_true, output, lambda_norm = 0.01, loss_type = c("huber", "mse", "mae")) {
x_recon <- output$x_recon
z_raw <- output$z_raw # gebruik ongenormaliseerd voor penalty
loss_type <- match.arg(loss_type)
recon_loss <- switch(loss_type,
huber = nnf_smooth_l1_loss(x_recon, x_true),
mse = nnf_mse_loss(x_recon, x_true),
mae = torch_mean(torch_abs(x_recon - x_true))
)
z_penalty <- torch_mean(z_raw^2)
total_loss <- recon_loss + lambda_norm * z_penalty
return(list(total = total_loss, recon = recon_loss, z_penalty = z_penalty))
}
# ------------------------------------------------------------------------------
run_simulation_parallel <- function(n_sim = 100, n = 1000) {
sim_indices <- seq_len(n_sim)
results_list <- future_lapply(sim_indices, function(i) {
out <- gen_data(n)
X <- out$X
eta_true <- out$eta_true
X_scaled <- scale(X)
x_tensor <- torch_tensor(as.matrix(X_scaled), dtype = torch_float())
x_noisy <- x_tensor + torch_randn_like(x_tensor) * 0.1
model <- autoencoder_nn(n_factors = n_factors, n_items = n_items)
optimizer <- optim_adam(model$parameters, lr = 0.001, weight_decay = 1e-4)
early_stopping <- 20
min_delta <- 1e-6
best_loss <- Inf
counter <- 0
for (epoch in 1:1000) {
model$train()
optimizer$zero_grad()
output <- model(x_noisy)
loss_list <- loss_ae(x_tensor, output)
loss_list$total$backward()
optimizer$step()
current_loss <- loss_list$total$item()
if (current_loss < best_loss - min_delta) {
best_loss <- current_loss
counter <- 0
} else {
counter <- counter + 1
if (counter >= early_stopping) break
}
}
z_hat <- as_array(output$z)
colnames(z_hat) <- paste0("f", 1:n_factors)
proc <- procrustes(X = eta_true, Y = z_hat, scale = TRUE)
z_rot <- proc$Yrot
df_rot <- as.data.frame(z_rot)
colnames(df_rot) <- paste0("f", 1:n_factors)
beta_ae <- coef(lm(f1 ~ f2 + f3 + f2:f3, data = df_rot))
df_true <- as.data.frame(eta_true)
colnames(df_true) <- paste0("f", 1:n_factors)
beta_true <- coef(lm(f1 ~ f2 + f3 + f2:f3, data = df_true))
measurement_model <- generate_measurement_model(n_factors = n_factors, n_items = n_items)
model.sam <- paste(measurement_model, '\nf1 ~ 1 + f2 + f3 + f2:f3')
fit <- try(sam(model.sam, data = X, se = "none"), silent = TRUE)
beta_sam <- if (!inherits(fit, "try-error")) {
PT <- parTable(fit)
PT[PT$op %in% c("~", "~1"), "est"][1:4]
} else {
rep(NA, 4)
}
list(beta_true = beta_true, beta_ae = beta_ae, beta_sam = beta_sam)
}, future.seed = TRUE)
beta_true_mat <- do.call(rbind, lapply(results_list, `[[`, "beta_true"))
beta_ae_mat <- do.call(rbind, lapply(results_list, `[[`, "beta_ae"))
beta_sam_mat <- do.call(rbind, lapply(results_list, `[[`, "beta_sam"))
list(beta_true = beta_true_mat, beta_ae = beta_ae_mat, beta_sam = beta_sam_mat)
}
# ------------------------------------------------------------------------------
plan(multisession, workers = parallel::detectCores() - 1)
set.seed(42)
res <- run_simulation_parallel(n_sim = 100)
# -------------------------------------------------------------------------
colnames(res$beta_true) <- colnames(res$beta_ae) <- colnames(res$beta_sam[, 1:4]) <- c("Intercept", "f2", "f3", "f2:f3")
true_mean <- colMeans(res$beta_true, na.rm = TRUE)
ae_mean <- colMeans(res$beta_ae, na.rm = TRUE)
sam_mean <- colMeans(res$beta_sam[, 1:4], na.rm = TRUE)
# Bias
bias_ae <- ae_mean - true_mean
bias_sam <- sam_mean - true_mean
# Variantie
var_ae <- apply(res$beta_ae, 2, var, na.rm = TRUE)
var_sam <- apply(res$beta_sam[, 1:4], 2, var, na.rm = TRUE)
# RMSE
rmse <- function(estimates, true_vals) {
sqrt(colMeans((estimates - matrix(true_vals, nrow = nrow(estimates), ncol = length(true_vals), byrow = TRUE))^2, na.rm = TRUE))
}
rmse_ae <- rmse(res$beta_ae, true_mean)
rmse_sam <- rmse(res$beta_sam[, 1:4], true_mean)
result_df <- data.frame(
# coef = names(true_mean),
bias_ae = round(bias_ae, 4),
bias_sam = round(bias_sam, 4),
var_ae = round(var_ae, 4),
var_sam = round(var_sam, 4),
rmse_ae = round(rmse_ae, 4),
rmse_sam = round(rmse_sam, 4)
)
result_df
# theta = 0.2
# bias_ae bias_sam var_ae var_sam rmse_ae rmse_sam
# Intercept 0.0001 0.0000 0.0001 0.0001 0.0087 0.0099
# f2 -0.0108 -0.0010 0.0013 0.0011 0.0375 0.0329
# f3 -0.0068 -0.0011 0.0012 0.0011 0.0351 0.0324
# f2:f3 -0.0021 -0.0015 0.0010 0.0012 0.0320 0.0349
# theta = 0.5
# bias_ae bias_sam var_ae var_sam rmse_ae rmse_sam
# Intercept 0.0016 0.0015 0.0000 0.0001 0.0071 0.0084
# f2 -0.0369 -0.0029 0.0009 0.0015 0.0474 0.0392
# f3 -0.0365 0.0001 0.0008 0.0014 0.0466 0.0376
# f2:f3 -0.0011 -0.0011 0.0011 0.0029 0.0336 0.0538
# theta = 0.5 + Huber loss functie
# bias_ae bias_sam var_ae var_sam rmse_ae rmse_sam
# Intercept 0.0001 0.0002 0.0001 0.0001 0.0074 0.0088
# f2 -0.0352 -0.0025 0.0011 0.0016 0.0485 0.0398
# f3 -0.0372 -0.0020 0.0009 0.0015 0.0473 0.0391
# f2:f3 -0.0021 -0.0048 0.0013 0.0032 0.0354 0.0564Auto-encoder CFA met crossloadings
library(torch)
library(MASS)
library(vegan)
library(lavaan)
# ------------------------------------------------------------------------------
gen_data <- function(n = 1000L, n_factors = 3, n_items = 3, crossloadings = TRUE) {
total_items <- n_factors * n_items
Sigma <- matrix(0.3, nrow = n_factors, ncol = n_factors)
diag(Sigma) <- 1
eta_true <- MASS::mvrnorm(n, mu = rep(0, n_factors), Sigma = Sigma)
colnames(eta_true) <- paste0("f", 1:n_factors)
Lambda <- matrix(0, nrow = total_items, ncol = n_factors)
for (f in 1:n_factors) {
row_start <- (f - 1) * n_items + 1
row_end <- f * n_items
Lambda[row_start:row_end, f] <- seq(0.4, 0.8, length.out = n_items)
}
# Voeg kruisladingen toe
if (crossloadings && n_factors >= 3) {
Lambda[1, 2] <- 0.3 # eerste item van f1 laadt ook op f2
Lambda[total_items, 1] <- 0.25 # laatste item van f3 laadt ook op f1
}
resid_sd <- sqrt(1 - rowSums(Lambda^2))
generate_items_with_crossloading <- function(eta, Lambda, resid_sd) {
n <- nrow(eta)
n_items <- nrow(Lambda)
data <- matrix(NA, nrow = n, ncol = n_items)
for (i in 1:n_items) {
true_score <- eta %*% Lambda[i, ]
noise <- rnorm(n, mean = 0, sd = resid_sd[i])
data[, i] <- true_score + noise
}
return(data)
}
X <- generate_items_with_crossloading(eta_true, Lambda, resid_sd)
# Genereer kolomnamen op basis van factorletters
factor_prefixes <- letters[24:(24 + n_factors - 1)] # x, y, z, ...
colnames(X) <- unlist(lapply(1:n_factors, function(f) {
paste0(factor_prefixes[f], 1:n_items)
}))
list(X = X, eta_true = eta_true, Lambda = Lambda)
}
# -------------------------------------------------------------------------
generate_measurement_model <- function(n_factors = 3, n_items = 5) {
factor_names <- paste0("f", 1:n_factors)
item_prefixes <- letters[24:(24 + n_factors - 1)] # x, y, z, ...
measurement_parts <- mapply(function(f, p) {
items <- paste0(p, 1:n_items)
paste(f, "=~", paste(items, collapse = " + "))
}, factor_names, item_prefixes)
model_string <- paste(measurement_parts, collapse = "\n")
return(model_string)
}
# ------------------------------------------------------------------------------
autoencoder_nn <- nn_module(
"StructuredAutoEncoder",
initialize = function(n_factors, n_items, hidden = 128, dropout_p = 0.05) {
self$n_factors <- n_factors
self$n_items <- n_items
self$total_items <- n_factors * n_items
self$encoders <- nn_module_list(lapply(1:n_factors, function(i) {
nn_sequential(
nn_linear(n_items, hidden),
nn_gelu(),
nn_dropout(dropout_p),
nn_linear(hidden, 1)
)
}))
self$decoders <- nn_module_list(lapply(1:n_factors, function(i) {
nn_sequential(
nn_linear(1, hidden),
nn_gelu(),
nn_dropout(dropout_p),
nn_linear(hidden, n_items)
)
}))
},
forward = function(x) {
factor_inputs <- list()
start_idx <- 1
for (i in 1:self$n_factors) {
end_idx <- start_idx + self$n_items - 1
factor_inputs[[i]] <- x[, start_idx:end_idx]
start_idx <- end_idx + 1
}
# Encoding
z_list <- lapply(1:self$n_factors, function(i) {
self$encoders[[i]](factor_inputs[[i]])$squeeze(2)
})
z <- torch_stack(z_list, dim = 2)
# Decoding
r_list <- lapply(1:self$n_factors, function(i) {
self$decoders[[i]](z_list[[i]]$unsqueeze(2))
})
x_recon <- torch_cat(r_list, dim = 2)
return(list(z = z, x_recon = x_recon))
}
)
loss_ae <- function(x_true, output) {
x_recon <- output$x_recon
recon_loss <- nnf_mse_loss(x_recon, x_true)
return(list(total = recon_loss, recon = recon_loss))
}
# ------------------------------------------------------------------------------
# Simulatie
run_simulation_parallel <- function(n_sim = 100, n = 1000, n_factors = 3,
n_items = 5) {
sim_indices <- seq_len(n_sim)
results_list <- future_lapply(sim_indices, function(i) {
out <- gen_data(n, n_factors = n_factors, n_items = n_items, crossloadings = TRUE)
X <- out$X
eta_true <- out$eta_true
X_scaled <- scale(X)
x_tensor <- torch_tensor(as.matrix(X_scaled), dtype = torch_float())
x_noisy <- x_tensor + torch_randn_like(x_tensor) * 0.1
model <- autoencoder_nn(n_factors = n_factors, n_items = n_items)
optimizer <- optim_adam(model$parameters, lr = 0.001, weight_decay = 1e-4)
early_stopping <- 20
min_delta <- 1e-6
best_loss <- Inf
counter <- 0
for (epoch in 1:1000) {
model$train()
optimizer$zero_grad()
output <- model(x_noisy)
loss_list <- loss_ae(x_tensor, output)
loss_list$total$backward()
optimizer$step()
current_loss <- loss_list$total$item()
if (current_loss < best_loss - min_delta) {
best_loss <- current_loss
counter <- 0
} else {
counter <- counter + 1
if (counter >= early_stopping) break
}
}
z_hat <- as_array(output$z)
colnames(z_hat) <- paste0("f", 1:n_factors)
proc <- procrustes(X = eta_true, Y = z_hat, scale = TRUE)
z_rot <- proc$Yrot
df_rot <- as.data.frame(z_rot)
colnames(df_rot) <- paste0("f", 1:n_factors)
beta_ae <- coef(lm(f1 ~ f2 + f3 + f2:f3, data = df_rot))
df_true <- as.data.frame(eta_true)
colnames(df_true) <- paste0("f", 1:n_factors)
beta_true <- coef(lm(f1 ~ f2 + f3 + f2:f3, data = df_true))
measurement_model <- generate_measurement_model(n_factors = n_factors, n_items = n_items)
model.sam <- paste(measurement_model, '\nf1 ~ 1 + f2 + f3 + f2:f3')
fit <- try(sam(model.sam, data = X, se = "none"), silent = TRUE)
beta_sam <- if (!inherits(fit, "try-error")) {
PT <- parTable(fit)
PT[PT$op %in% c("~", "~1"), "est"][1:4]
} else {
rep(NA, 4)
}
list(beta_true = beta_true, beta_ae = beta_ae, beta_sam = beta_sam)
}, future.seed = TRUE)
# Combineer resultaten
beta_true_mat <- do.call(rbind, lapply(results_list, `[[`, "beta_true"))
beta_ae_mat <- do.call(rbind, lapply(results_list, `[[`, "beta_ae"))
beta_sam_mat <- do.call(rbind, lapply(results_list, `[[`, "beta_sam"))
list(beta_true = beta_true_mat, beta_ae = beta_ae_mat, beta_sam = beta_sam_mat)
}
# ------------------------------------------------------------------------------
library(future.apply)
plan(multisession, workers = parallel::detectCores() - 1)
set.seed(123)
n_factors <- 3
n_items <- 5
res <- run_simulation_parallel(n_sim = 100)
colnames(res$beta_true) <- colnames(res$beta_ae) <- colnames(res$beta_sam[, 1:4]) <- c("Intercept", "f2", "f3", "f2:f3")
# Gemiddelde schattingen
true_mean <- colMeans(res$beta_true, na.rm = TRUE)
ae_mean <- colMeans(res$beta_ae, na.rm = TRUE)
sam_mean <- colMeans(res$beta_sam[, 1:4], na.rm = TRUE)
# Bias
bias_ae <- ae_mean - true_mean
bias_sam <- sam_mean - true_mean
# Variantie
var_ae <- apply(res$beta_ae, 2, var, na.rm = TRUE)
var_sam <- apply(res$beta_sam[, 1:4], 2, var, na.rm = TRUE)
# RMSE
rmse <- function(estimates, true_vals) {
sqrt(colMeans((estimates - matrix(true_vals, nrow = nrow(estimates), ncol = length(true_vals), byrow = TRUE))^2, na.rm = TRUE))
}
rmse_ae <- rmse(res$beta_ae, true_mean)
rmse_sam <- rmse(res$beta_sam[, 1:4], true_mean)
# Combineer alles in een data.frame
result_df <- data.frame(
# coef = names(true_mean),
bias_ae = round(bias_ae, 4),
bias_sam = round(bias_sam, 4),
var_ae = round(var_ae, 4),
var_sam = round(var_sam, 4),
rmse_ae = round(rmse_ae, 4),
rmse_sam = round(rmse_sam, 4)
)
result_df
# bias_ae bias_sam var_ae var_sam rmse_ae rmse_sam
# Intercept -0.0012 -0.0012 0.0000 0.0000 0.0067 0.0066
# f2 0.0031 0.0446 0.0015 0.0035 0.0393 0.0736
# f3 0.0149 0.2283 0.0011 0.0041 0.0361 0.2371
# f2:f3 0.0009 -0.0080 0.0011 0.0150 0.0337 0.1221Auto-encoder CFA - benchmark beta’s AE, regression en Bartlett
n_factors <- 3
n_items <- 5
n <- 1000
gen_data <- function(n = 1000, theta = 0.5, n_factors = 3, n_items = 5) {
Sigma <- matrix(theta, n_factors, n_factors)
diag(Sigma) <- 1
eta <- MASS::mvrnorm(n, mu = rep(0, n_factors), Sigma = Sigma)
Lambda <- matrix(0, nrow = n_factors * n_items, ncol = n_factors)
for (j in 1:n_factors) {
rows <- ((j - 1) * n_items + 1):(j * n_items)
Lambda[rows, j] <- runif(n_items, 0.7, 0.9)
}
eps <- matrix(rnorm(n * nrow(Lambda)), n, nrow(Lambda))
X <- eta %*% t(Lambda) + eps
prefix <- letters[24:(24 + n_factors - 1)] # x, y, z, ...
colnames(X) <- unlist(lapply(prefix, function(p) paste0(p, 1:n_items)))
list(X = X, eta_true = eta)
}
autoencoder_nldae <- nn_module(
"NoiseLearningAutoEncoder",
initialize = function(n_factors, n_items, hidden = 128, dropout_p = 0.00) {
self$n_factors <- n_factors
self$n_items <- n_items
self$total_items <- n_factors * n_items
self$encoders <- nn_module_list(lapply(1:n_factors, function(i) {
nn_sequential(
nn_linear(n_items, hidden), nn_gelu(),
nn_dropout(dropout_p), nn_linear(hidden, 1)
)
}))
self$decoders <- nn_module_list(lapply(1:n_factors, function(i) {
nn_sequential(
nn_linear(1, hidden), nn_gelu(),
nn_dropout(dropout_p), nn_linear(hidden, n_items)
)
}))
self$noise_predictors <- nn_module_list(lapply(1:n_factors, function(i) {
nn_sequential(
nn_linear(n_items, hidden), nn_gelu(),
nn_linear(hidden, n_items)
)
}))
self$z_decoder <- nn_sequential(
nn_linear(n_factors, hidden), nn_gelu(),
nn_dropout(dropout_p), nn_linear(hidden, n_factors)
)
},
forward = function(x_true, x_noisy) {
factor_inputs <- list()
start_idx <- 1
for (i in 1:self$n_factors) {
end_idx <- start_idx + self$n_items - 1
factor_inputs[[i]] <- x_noisy[, start_idx:end_idx]
start_idx <- end_idx + 1
}
z_list <- lapply(1:self$n_factors, function(i) {
self$encoders[[i]](factor_inputs[[i]])$squeeze(2)
})
z <- torch_stack(z_list, dim = 2)
r_list <- lapply(1:self$n_factors, function(i) {
self$decoders[[i]](z_list[[i]]$unsqueeze(2))
})
x_recon <- torch_cat(r_list, dim = 2)
noise_hat_list <- lapply(1:self$n_factors, function(i) {
self$noise_predictors[[i]](factor_inputs[[i]])
})
x_noise_hat <- torch_cat(noise_hat_list, dim = 2)
z_recon <- self$z_decoder(z)
list(z = z, x_recon = x_recon, x_noise_hat = x_noise_hat, z_recon = z_recon)
}
)
loss_nldae <- function(x_true, x_noisy, output,
lambda_noise = 0.2, lambda_z = 0.0001, lambda_zrecon = 0.02) {
recon_loss <- nnf_mse_loss(output$x_recon, x_true)
noise_loss <- nnf_mse_loss(output$x_noise_hat, x_noisy - x_true)
z_penalty <- torch_mean(output$z^2)
z_recon_loss <- nnf_mse_loss(output$z_recon, output$z)
total_loss <- recon_loss + lambda_noise * noise_loss + lambda_z * z_penalty + lambda_zrecon * z_recon_loss
list(total = total_loss, recon = recon_loss)
}
get_ae_coefs <- function(X_scaled, n_factors, n_items, eta_true) {
x_tensor <- torch_tensor(as.matrix(X_scaled), dtype = torch_float())
model <- autoencoder_nldae(n_factors, n_items)
optimizer <- optim_adam(model$parameters, lr = 1e-4, weight_decay = 1e-4)
best_loss <- Inf; counter <- 0
for (epoch in 1:2000) {
model$train(); optimizer$zero_grad()
noise <- torch_normal(mean = 0, std = 0.1, size = x_tensor$size())
x_noisy <- x_tensor + noise
output <- model(x_true = x_tensor, x_noisy = x_noisy)
loss_list <- loss_nldae(x_tensor, x_noisy, output)
loss_list$total$backward(); optimizer$step()
current_loss <- loss_list$total$item()
if (current_loss < best_loss - 1e-6) { best_loss <- current_loss; counter <- 0 } else { counter <- counter + 1 }
if (counter >= 20) break
}
out <- model(x_true = x_tensor, x_noisy = x_tensor)
z_hat <- as_array(out$z)
proc <- vegan::procrustes(X = eta_true, Y = z_hat, scale = TRUE)
z_rot <- proc$Yrot
df_rot <- as.data.frame(z_rot)
colnames(df_rot) <- paste0("f", 1:n_factors)
coef(lm(f1 ~ f2 + f3, data = df_rot))
}
generate_measurement_model <- function(n_factors = 3, n_items = 5) {
factor_names <- paste0("f", 1:n_factors)
item_prefixes <- letters[24:(24 + n_factors - 1)] # x, y, z, ...
measurement_parts <- mapply(function(f, p) {
items <- paste0(p, 1:n_items)
paste(f, "=~", paste(items, collapse = " + "))
}, factor_names, item_prefixes)
model_string <- paste(measurement_parts, collapse = "\n")
return(model_string)
}
# regressie met Regressie factor scores
get_regression_coefs <- function(X) {
model_txt <- generate_measurement_model(n_factors, n_items)
model_txt <- paste(model_txt, '\nf1 ~ 1 + f2 + f3')
fit <- lavaan::cfa(model_txt, data = X, se = "none")
z <- lavPredict(fit, method = "regression")
coef(lm(f1 ~ f2 + f3, data = as.data.frame(z)))
}
# regressie met Bartlett factor scores
get_bartlett_coefs <- function(X) {
model_txt <- generate_measurement_model(n_factors, n_items)
model_txt <- paste(model_txt, '\nf1 ~ 1 + f2 + f3')
fit <- lavaan::cfa(model_txt, data = X, se = "none")
z <- lavPredict(fit, method = "Bartlett")
coef(lm(f1 ~ f2 + f3, data = as.data.frame(z)))
}
# run simualatie
simulate_once <- function() {
data <- gen_data(n, theta = 0.5)
X <- data$X
eta <- data$eta_true
colnames(eta) <- paste0("f", 1:n_factors)
beta_true <- coef(lm(f1 ~ f2 + f3, data = as.data.frame(eta)))
X_scaled <- scale(X)
beta_reg <- get_regression_coefs(X)
beta_bart <- get_bartlett_coefs(X)
beta_ae <- get_ae_coefs(X_scaled, n_factors, n_items, eta)
data.frame(
method = rep(c("regression", "bartlett", "ae"), each = 3),
term = rep(names(beta_true), 3),
bias = c(beta_reg - beta_true, beta_bart - beta_true, beta_ae - beta_true),
est = c(beta_reg, beta_bart, beta_ae),
true = rep(beta_true, 3)
)
}
library(torch)
library(MASS)
library(vegan)
library(lavaan)
library(parallel)
n_sim <- 200
n_cores <- max(1, detectCores() - 1)
cl <- makeCluster(n_cores)
clusterExport(cl, varlist = c(
"generate_measurement_model",
"simulate_once", "n_factors", "n_items", "n",
"gen_data", "autoencoder_nldae", "loss_nldae",
"get_ae_coefs", "get_regression_coefs",
"get_bartlett_coefs"
))
clusterEvalQ(cl, {
library(torch)
library(lavaan)
library(vegan)
})
results_list <- parLapply(cl, 1:n_sim, function(i) {
simulate_once()
})
results <- do.call(rbind, results_list)
stopCluster(cl)
saveRDS(results, file = "20250813_benchmark_results_beta_rmse.rds")
#results <- readRDS("20250813_benchmark_results_beta_rmse.rds")
results_summary <- results %>%
group_by(method, term) %>%
summarise(
bias = mean(bias),
var = var(est),
rmse = sqrt(mean((est - true)^2)),
.groups = "drop"
)
results_summary
# A tibble: 9 × 5
method term bias var rmse
<chr> <chr> <dbl> <dbl> <dbl>
1 ae (Intercept) -0.00000801 2.51e-33 0.0242
2 ae f2 -0.0535 1.91e- 3 0.0668
3 ae f3 -0.0538 1.93e- 3 0.0672
4 bartlett (Intercept) -0.00000801 2.87e-33 0.0242
5 bartlett f2 -0.0570 1.92e- 3 0.0705
6 bartlett f3 -0.0541 2.02e- 3 0.0689
7 regression (Intercept) -0.00000801 1.32e-33 0.0242
8 regression f2 0.0484 6.12e- 3 0.0845
9 regression f3 0.0446 6.32e- 3 0.0836Auto-encoder CFA - benchmark factor scores AE, regression en Bartlett
n_factors <- 3
n_items <- 5
n <- 1000
gen_data <- function(n = 1000, theta = 0.5, n_factors = 3, n_items = 5) {
Sigma <- matrix(theta, n_factors, n_factors)
diag(Sigma) <- 1
eta <- MASS::mvrnorm(n, mu = rep(0, n_factors), Sigma = Sigma)
Lambda <- matrix(0, nrow = n_factors * n_items, ncol = n_factors)
for (j in 1:n_factors) {
rows <- ((j - 1) * n_items + 1):(j * n_items)
Lambda[rows, j] <- runif(n_items, 0.7, 0.9)
}
eps <- matrix(rnorm(n * nrow(Lambda)), n, nrow(Lambda))
X <- eta %*% t(Lambda) + eps
prefix <- letters[24:(24 + n_factors - 1)] # x, y, z, ...
colnames(X) <- unlist(lapply(prefix, function(p) paste0(p, 1:n_items)))
list(X = X, eta_true = eta)
}
autoencoder_nldae <- nn_module(
"NoiseLearningAutoEncoder",
initialize = function(n_factors, n_items, hidden = 128, dropout_p = 0.00) {
self$n_factors <- n_factors
self$n_items <- n_items
self$total_items <- n_factors * n_items
self$encoders <- nn_module_list(lapply(1:n_factors, function(i) {
nn_sequential(
nn_linear(n_items, hidden), nn_gelu(),
nn_dropout(dropout_p), nn_linear(hidden, 1)
)
}))
self$decoders <- nn_module_list(lapply(1:n_factors, function(i) {
nn_sequential(
nn_linear(1, hidden), nn_gelu(),
nn_dropout(dropout_p), nn_linear(hidden, n_items)
)
}))
self$noise_predictors <- nn_module_list(lapply(1:n_factors, function(i) {
nn_sequential(
nn_linear(n_items, hidden), nn_gelu(),
nn_linear(hidden, n_items)
)
}))
self$z_decoder <- nn_sequential(
nn_linear(n_factors, hidden), nn_gelu(),
nn_dropout(dropout_p), nn_linear(hidden, n_factors)
)
},
forward = function(x_true, x_noisy) {
factor_inputs <- list()
start_idx <- 1
for (i in 1:self$n_factors) {
end_idx <- start_idx + self$n_items - 1
factor_inputs[[i]] <- x_noisy[, start_idx:end_idx]
start_idx <- end_idx + 1
}
z_list <- lapply(1:self$n_factors, function(i) {
self$encoders[[i]](factor_inputs[[i]])$squeeze(2)
})
z <- torch_stack(z_list, dim = 2)
r_list <- lapply(1:self$n_factors, function(i) {
self$decoders[[i]](z_list[[i]]$unsqueeze(2))
})
x_recon <- torch_cat(r_list, dim = 2)
noise_hat_list <- lapply(1:self$n_factors, function(i) {
self$noise_predictors[[i]](factor_inputs[[i]])
})
x_noise_hat <- torch_cat(noise_hat_list, dim = 2)
z_recon <- self$z_decoder(z)
list(z = z, x_recon = x_recon, x_noise_hat = x_noise_hat, z_recon = z_recon)
}
)
loss_nldae <- function(x_true, x_noisy, output,
lambda_noise = 0.2, lambda_z = 0.0001, lambda_zrecon = 0.02) {
recon_loss <- nnf_mse_loss(output$x_recon, x_true)
noise_loss <- nnf_mse_loss(output$x_noise_hat, x_noisy - x_true)
z_penalty <- torch_mean(output$z^2)
z_recon_loss <- nnf_mse_loss(output$z_recon, output$z)
total_loss <- recon_loss + lambda_noise * noise_loss + lambda_z * z_penalty + lambda_zrecon * z_recon_loss
list(total = total_loss, recon = recon_loss)
}
get_ae_scores <- function(X_scaled, n_factors, n_items, eta_true) {
x_tensor <- torch_tensor(as.matrix(X_scaled), dtype = torch_float())
model <- autoencoder_nldae(n_factors, n_items)
optimizer <- optim_adam(model$parameters, lr = 1e-4, weight_decay = 1e-4)
best_loss <- Inf; counter <- 0
for (epoch in 1:2000) {
model$train(); optimizer$zero_grad()
noise <- torch_normal(mean = 0, std = 0.1, size = x_tensor$size())
x_noisy <- x_tensor + noise
output <- model(x_true = x_tensor, x_noisy = x_noisy)
loss_list <- loss_nldae(x_tensor, x_noisy, output)
loss_list$total$backward(); optimizer$step()
current_loss <- loss_list$total$item()
if (current_loss < best_loss - 1e-6) { best_loss <- current_loss; counter <- 0 } else { counter <- counter + 1 }
if (counter >= 20) break
}
out <- model(x_true = x_tensor, x_noisy = x_tensor)
z_hat <- as_array(out$z)
colnames(z_hat) <- paste0("f", 1:n_factors)
proc <- vegan::procrustes(X = eta_true, Y = z_hat, scale = TRUE)
z_rot <- proc$Yrot
z_rot
}
generate_measurement_model <- function(n_factors = 3, n_items = 5) {
factor_names <- paste0("f", 1:n_factors)
item_prefixes <- letters[24:(24 + n_factors - 1)] # x, y, z, ...
measurement_parts <- mapply(function(f, p) {
items <- paste0(p, 1:n_items)
paste(f, "=~", paste(items, collapse = " + "))
}, factor_names, item_prefixes)
model_string <- paste(measurement_parts, collapse = "\n")
return(model_string)
}
get_lavaan_scores <- function(X, method = c("regression", "Bartlett")) {
method <- match.arg(method)
model_txt <- generate_measurement_model(n_factors, n_items)
model_txt <- paste(model_txt, '\nf1 ~ 1 + f2 + f3')
fit <- lavaan::cfa(model_txt, data = X, se = "none")
lavPredict(fit, method = method)
}
# run simulatie
simulate_once <- function() {
data <- gen_data(n, theta = 0.5)
X <- data$X
eta <- data$eta_true
colnames(eta) <- paste0("f", 1:n_factors)
X_scaled <- scale(X)
z_reg <- get_lavaan_scores(X, method = "regression")
z_bart <- get_lavaan_scores(X, method = "Bartlett")
z_ae <- get_ae_scores(X_scaled, n_factors, n_items, eta)
data.frame(
method = rep(c("regression", "bartlett", "ae"), each = nrow(eta) * n_factors),
factor = rep(rep(paste0("f", 1:n_factors), each = nrow(eta)), 3),
error = c(as.vector(z_reg - eta),
as.vector(z_bart - eta),
as.vector(z_ae - eta))
)
}
###
set.seed(90210)
library(torch)
library(MASS)
library(vegan)
library(lavaan)
library(parallel)
n_sim <- 200
n_cores <- max(1, detectCores() - 1)
cl <- makeCluster(n_cores)
clusterExport(cl, varlist = c(
"generate_measurement_model",
"simulate_once", "n_factors", "n_items", "n",
"gen_data", "autoencoder_nldae", "loss_nldae",
"get_ae_scores", "get_lavaan_scores"
))
clusterEvalQ(cl, {
library(torch)
library(lavaan)
library(vegan)
})
results_list <- parLapply(cl, 1:n_sim, function(i) {
simulate_once()
})
results <- do.call(rbind, results_list)
stopCluster(cl)
saveRDS(results, file = "20250813_benchmark_results_fscores_rmse.rds")
#results <- readRDS("20250813_benchmark_results_fscores_rmse.rds")
results_summary <- results %>%
group_by(method, factor) %>%
summarise(
bias = mean(bias),
var = var(est),
rmse = sqrt(mean((est - true)^2)),
.groups = "drop"
)
# A tibble: 9 × 4
method factor bias var
<chr> <chr> <dbl> <dbl>
1 ae f1 0.000874 0.246
2 ae f2 -0.00137 0.245
3 ae f3 -0.000639 0.245
4 bartlett f1 0.000874 0.247
5 bartlett f2 -0.00137 0.245
6 bartlett f3 -0.000639 0.246
7 regression f1 0.000874 0.254
8 regression f2 -0.00137 0.256
9 regression f3 -0.000639 0.254Semi-supervised Auto-encoder CFA - benchmark beta’s AE, regression en Bartlett
# Benchmark AE-CFA vs Bartlett/Regression
library(torch)
library(lavaan)
library(MASS)
library(dplyr)
library(future.apply)
## Data genereren
# mooie multivariate factor scores
gen_data <- function(n = 1000L, theta = 0.5, n_factors = 3, items_per_factor = 5) {
Sigma <- matrix(theta, n_factors, n_factors)
diag(Sigma) <- 1
eta <- MASS::mvrnorm(n, mu = rep(0, n_factors), Sigma = Sigma)
colnames(eta) <- paste0("f", 1:n_factors)
n_items <- n_factors * items_per_factor
Lambda <- matrix(0, nrow = n_items, ncol = n_factors)
for (j in 1:n_factors) {
rows <- ((j - 1) * items_per_factor + 1):(j * items_per_factor)
Lambda[rows, j] <- runif(items_per_factor, 0.7, 0.9)
}
eps <- matrix(rnorm(n * n_items), n, n_items)
X <- eta %*% t(Lambda) + eps
colnames(X) <- paste0("x", 1:n_items)
list(X = X, eta_true = eta)
}
# Items zijn niet-lineair in hun factor(en) (tanh)
# Elke itemset bevat expliciete cross- en INTERACTIE-termen
# Ruis is heteroscedastisch
gen_data_interact_misfit <- function(n = 1000L,
n_factors = 3L,
items_per_factor = 5L,
phi = 0.3,
main_load = c(0.7, 1.0),
cross_load = c(0.25, 0.45),
inter_load = c(0.20, 0.35), # range interactie-loading
noise_base = 0.25,
noise_hetero = 0.20,
seed = NULL) {
if (!is.null(seed)) set.seed(seed)
# Latente factoren met correlatie ~ phi
Sigma <- matrix(phi, n_factors, n_factors); diag(Sigma) <- 1
eta <- MASS::mvrnorm(n, mu = rep(0, n_factors), Sigma = Sigma)
colnames(eta) <- paste0("f", 1:n_factors)
# Items genereren met niet-lineariteit + cross + INTERACTIE in de METING
# f1-items bevatten o.a. (eta2 * eta3), f2-items (eta1 * eta3), f3-items (eta1 * eta2)
X <- NULL
for (j in 1:n_factors) {
others <- setdiff(1:3, j) # de twee andere factoren
k <- others[1]; l <- others[2]
# trek coefs per item
lam_main <- runif(items_per_factor, main_load[1], main_load[2])
lam_cross1 <- runif(items_per_factor, cross_load[1], cross_load[2])
lam_cross2 <- runif(items_per_factor, cross_load[1], cross_load[2])
lam_inter <- runif(items_per_factor, inter_load[1], inter_load[2])
items <- vapply(1:items_per_factor, function(i) {
# signaal: hoofd + cross + INTERACTIE
signal <- lam_main[i] * eta[, j] +
lam_cross1[i]* eta[, k] +
lam_cross2[i]* eta[, l] +
lam_inter[i] * (eta[, k] * eta[, l]) # maakt lineaire CFA mis-specified
# niet-lineaire meetfunctie + heteroscedastische ruis
mu <- tanh(signal)
sdev <- noise_base + noise_hetero * (abs(eta[, k]) + abs(eta[, l]))/2
y <- mu + rnorm(n, 0, sdev)
# standaardiseer item
as.numeric(scale(y))
}, numeric(n))
colnames(items) <- paste0("x", ((j - 1) * items_per_factor + 1):(j * items_per_factor))
X <- cbind(X, items)
}
X <- as.data.frame(X)
list(X = X, eta_true = eta)
}
# ---------- lavaan model syntax ----------
make_model_txt <- function(colnames_X, n_factors, items_per_factor) {
paste0(
paste0("f", 1:n_factors, " =~ ",
sapply(1:n_factors, function(j)
paste0(colnames_X[((j - 1) * items_per_factor + 1):(j * items_per_factor)],
collapse = " + ")),
collapse = "\n")
)
}
# CFA-geïnspireerde AE met extra constraints en penalties
AutoencoderSoftMask <- nn_module(
"AutoencoderSoftMask",
initialize = function(n_items, n_factors, mask, items_per_factor) {
# Encoder
# Doel: reduceer de hoge-dimensionele itemdata naar een klein aantal latente factoren.
# Lineaire mapping, zoals in CFA (items × loadings = factoren).
# Geen bias, omdat CFA-latente variabelen gemiddeld nul zijn (identificatie).
self$enc <- nn_linear(n_items, n_factors, bias = FALSE)
# Decoder: factoren terug naar ietms reconstureren
# Dit is een neuraal netwerk met meerdere lagen die lineaire transformaties en
# niet-lineariteiten combineren om complexe patronen te leren.
# relu() laat toe om niet-lineaire patronen te leren en staat interacties toe
# Een puur lineaire decoder zou exact een CFA zijn.
self$dec <- nn_sequential(
nn_linear(n_factors, 32),
nn_relu(),
nn_linear(32, n_items)
)
# leert itemvarianties
# Niet elk item is even betrouwbaar. Hier leeft de AE de meetfout.
self$log_sigma2 <- nn_parameter(torch_zeros(n_items))
# Soft mask (penalty i.p.v. hard zero)
# Dit laat kleine cross-loadings toe, maar houdt ze klein door een penalty.
self$Lambda <- nn_parameter(torch_randn(n_items, n_factors) * 0.05)
# Geeft aan welke items horen bij welke factor (CFA-structuur).
self$mask <- mask
self$n_factors <- n_factors
self$items_per_factor <- items_per_factor
},
encode = function(x) self$enc(x),
decode = function(z) self$dec(z),
forward = function(x_in) {
z <- self$encode(x_in)
x_hat <- self$decode(z)
list(z = z, x_hat = x_hat)
},
# Straft elke loading die buiten de CFA-structuur valt.
cross_penalty = function() {
mask_inv <- 1 - self$mask
(self$Lambda * mask_inv)$pow(2)$mean()
},
# houd encoder gewichten klein
# Zorgt dat de encoder stabieler wordt en minder gevoelig voor kleine ruis in de input.
jacobian_penalty = function() {
self$enc$weight$pow(2)$sum()
},
project_params = function() {
with_no_grad({
# Identificatie: zet eerste item per factor op 1 !!
for (j in 1:self$n_factors) {
i <- (j - 1) * self$items_per_factor + 1L
self$Lambda[i, j]$fill_(1)
}
})
}
)
# ---------- Trainer met variance-reducerende losses ----------
# Train een autoencoder die latente factoren (z) leert uit itemdata.
# Gebruik meerdere loss-termen (reconstructie, consistency, CFA-structuur, regularisatie).
# Kalibreer de latente factoren achteraf, zodat ze vergelijkbaar zijn met CFA-factoren.
# Geef uiteindelijk regressiecoëfficiënten terug (f1 ~ f2 * f3).
get_ae_coefs_cfa <- function(
X, n_factors, items_per_factor,
n_epochs = 2000,
lr = 3e-3,
weight_decay = 1e-4,
warmup_T = 10, # teacher snel loslaten
penalty_T = 10, # cross-loading penalty ook snel loslaten
cov_T = 100,
keep_p_start = 0.7, # sterke denoising in het begin
keep_p_end = 1.0, # naar 1.0 tegen het einde
lambda_cons = 1.0,
lambda_jac = 1e-3, # houd deze laag om bias te voorkomen
lambda_cross0 = 1e-3,
lambda_cov0 = 1e-1,
recolor_gamma = 0.2
) {
n_items <- ncol(X)
# Eerst standaardiseren zoals bij CFA
Xs <- scale(X)
x_tensor <- torch_tensor(as.matrix(Xs), dtype = torch_float())
# CFA teacher: Zorgt dat de AE niet willekeurig start, maar in de eerste epochs lijkt op CFA.
# Daarna mag het vrijer. De meetfout wordt niet van de teacher geleerd, maar door de AE zelf.
model_txt <- make_model_txt(colnames(X), n_factors, items_per_factor)
fit <- lavaan::cfa(model_txt, data = as.data.frame(X), se = "none", std.lv = TRUE)
# dit zijn de teacher scores voor een korte warm-up.
z_teacher <- scale(lavPredict(fit, method = "regression", transform = TRUE))
z_teacher_t <- torch_tensor(z_teacher, dtype = torch_float())
# geschatte covariantiematrix van de factoren -> structuurdoel voor AE.
Phi_hat <- lavInspect(fit, "cov.lv")
# Welke items bij welke factor horen.
mask_mat <- matrix(0, nrow = n_items, ncol = n_factors)
for (j in 1:n_factors) {
mask_mat[((j - 1) * items_per_factor + 1):(j * items_per_factor), j] <- 1
}
mask <- torch_tensor(mask_mat, dtype = torch_float())
# model + opt
model <- AutoencoderSoftMask(n_items, n_factors, mask, items_per_factor)
# Gebruik Adam optimizer met weight decay (L2-regularisatie).
opt <- optim_adam(model$parameters, lr = lr, weight_decay = weight_decay)
# helpers
# cov2cor <- function(covm) {
# d <- as.numeric(diag(covm))
# Dm <- diag(1 / sqrt(pmax(d, 1e-8)))
# Dm %*% covm %*% Dm
# }
## EMA (Exponential Moving Average) van encoder
# Maakt de uiteindelijke factor-scores stabieler en minder gevoelig voor ruis.
# EMA is in feite een gemiddeld gewicht over de laatste ~100 epochs.
# 1 / (1 - ema_beta) = 1 / (1 - 0.99) = ~ 100 stappen
# Zonder EMA kunnen encodergewichten springen door ruis of stochastische updates.
# Een ema_beta van 0.99 geeft een stabiel gemiddelde van de encodergewichten en zorgt voor
# consistente factor_scores die niet te gevoelig zijn voor meetfouten.
ema_enc_w <- model$enc$weight$detach()$clone()
ema_beta <- 0.99
update_ema <- function() {
with_no_grad({
ema_enc_w$mul_(ema_beta)$add_((1 - ema_beta) * model$enc$weight$detach())
})
}
for (epoch in 1:n_epochs) {
model$train(); opt$zero_grad()
## denoising stukje: De AE moet leren om gemaskeerde of vervormde inputs te reconstrueren.
# keep_p = kans dat een item behouden blijft (dus niet gemaskeerd)
# start = 0.7 in het begin blijft 70% van de data over
# end = 1, tegen het eind blijft alles behouden
# het groeit lineair met de epoch. Dus veel ruis aan het begin, steeds minder
# ruis naarmate de training vordert.
keep_p <- keep_p_start + (keep_p_end - keep_p_start) * (epoch / n_epochs)
# twee onafhankelijke versies
# trekt voor elk element een 0 of 1, met kans keep_p op 1.
m1 <- torch_bernoulli(torch_full_like(x_tensor, keep_p))
m2 <- torch_bernoulli(torch_full_like(x_tensor, keep_p))
# input waarbij sommige items random op 0 zijn gezet (gemaskeerd).
x1 <- x_tensor * m1
x2 <- x_tensor * m2
# forward twee augmentaties (voor consistency)
out1 <- model(x1)
out2 <- model(x2)
# we gebruikten de log zodat het model vrij kan leren van -Inf tot +Inf
# hier zetten we de log-transformaties waar terug
sigma2 <- torch_exp(model$log_sigma2)
# beperk de waarden van min tot max. Te kleine waarden varianties leden tot enorme NLL-waarden.
# Dit houdt de training stabiel
sigma2 <- torch_clamp(sigma2, min = 1e-4, max = 1e+2)
# Meet reconstructiefout, maar weegt die per item met zijn variantie (sigma2).
# Equivalent van CFA met item-specifieke errorvarianties.
resid <- x_tensor - out1$x_hat
nll <- 0.5 * torch_mean((resid * resid) / sigma2 + torch_log(sigma2))
# zorgt voor stabiele factoren, ook als de input wat ruis heeft
cons_loss <- nnf_mse_loss(out1$z, out2$z)
z_all <- as.matrix(model$encode(x_tensor)$detach())
zc <- scale(z_all, center = TRUE, scale = FALSE)
Sz <- cov(zc)
Cz <- stats::cov2cor(Sz)
# Correlaties van AE-factoren (z) moeten lijken op CFA-covariantie (Phi_hat).
cov_pen <- mean((Cz - Phi_hat)^2)
w_cov <- lambda_cov0 * max(0, 1 - epoch / cov_T)
# Houd kruisladingen klein, behoud cfa structuur
w_cross <- lambda_cross0 * (0.2 + 0.8 * max(0, 1 - epoch / penalty_T)) # 20% blijft
cross_pen <- model$cross_penalty()
# houd encodergwichte klein (robuuster)
jac_pen <- model$jacobian_penalty()
# Teacher warm-up (dit neemt snel af en daarna mag de AE vrij leren)
w_sup <- max(0, 1 - epoch / warmup_T)
sup_loss <- nnf_mse_loss(out1$z, z_teacher_t)
# Hier combineren we alle termen
loss <- nll +
lambda_cons * cons_loss +
w_cov * torch_tensor(cov_pen, dtype = torch_float()) +
w_cross * cross_pen +
lambda_jac * jac_pen +
w_sup * sup_loss
loss$backward()
nn_utils_clip_grad_norm_(model$parameters, max_norm = 1.0)
opt$step()
model$project_params()
update_ema()
}
with_no_grad({
enc_w_backup <- model$enc$weight$detach()$clone()
model$enc$weight$copy_(ema_enc_w)
z_hat <- as.matrix(model$encode(x_tensor)$detach())
colnames(z_hat) <- paste0("f", 1:n_factors)
# Recoloring: pas een lineaire transformatie toe zodat de factorcovariantie dichter bij Phi_hat ligt.
# Resultaat: gekalibreerde factor-scores (z_cal).
Sz <- cov(scale(z_hat, center = TRUE, scale = FALSE))
Phi_star <- (1 - recolor_gamma) * Phi_hat + recolor_gamma * Sz
es <- eigen(Sz)
Sz_mhalf <- es$vectors %*% diag(1 / sqrt(pmax(es$values, 1e-8))) %*% t(es$vectors)
ep <- eigen(Phi_star)
Phi_phalf <- ep$vectors %*% diag(sqrt(pmax(ep$values, 1e-8))) %*% t(ep$vectors)
A <- Sz_mhalf %*% Phi_phalf
z_cal <- z_hat %*% A
colnames(z_cal) <- paste0("f", 1:n_factors)
model$enc$weight$copy_(enc_w_backup) # zet terug (voor eventueel hergebruik)
# Gebruik gekalibreerde scores voor regressie
coef(lm(f1 ~ f2 * f3, data = as.data.frame(z_cal)))
})
}
# benchmark code
simulate_once <- function(datagen_fun, n = 1000, n_factors = 3, items_per_factor = 5) {
data <- datagen_fun(n = n, n_factors = n_factors, items_per_factor = items_per_factor)
X <- data$X; eta <- data$eta_true
colnames(eta) <- paste0("f", 1:n_factors)
beta_true <- coef(lm(f1 ~ f2 * f3, data = as.data.frame(eta)))
beta_ae <- get_ae_coefs_cfa(X, n_factors, items_per_factor,
n_epochs = 2000, lr = 5e-3,
weight_decay = 1e-4,
warmup_T = 10, penalty_T = 10,
lambda_cons = 1.0,
lambda_jac = 1e-3,
lambda_cross0 = 1e-3,
lambda_cov0 = 1e-2,
recolor_gamma = 0.2)
model_txt <- make_model_txt(colnames(X), n_factors, items_per_factor)
fit <- lavaan::cfa(model_txt, data = as.data.frame(X), se = "none", std.lv = TRUE)
beta_reg <- coef(lm(f1 ~ f2 * f3, data = as.data.frame(lavPredict(fit, method = "regression", transform = TRUE))))
beta_bart <- coef(lm(f1 ~ f2 * f3, data = as.data.frame(lavPredict(fit, method = "bartlett", transform = TRUE))))
data.frame(
method = rep(c("ae_cfa", "regression", "bartlett"), each = length(beta_true)),
term = rep(names(beta_true), 3),
bias = c(beta_ae - beta_true, beta_reg - beta_true, beta_bart - beta_true),
est = c(beta_ae, beta_reg, beta_bart),
true = rep(beta_true, 3)
)
}
# Run benchmark
set.seed(123)
datagen_fun <- gen_data
# tmp <- datagen_fun(n=1000, n_factors=3, items_per_factor=5)
# X <- tmp$X
# model_txt <- make_model_txt(colnames(X), n_factors = 3, items_per_factor = 5)
# fit_sam <- lavaan::sam(model_txt, data = as.data.frame(X), se = "none", std.lv = TRUE)
# summary(fit_sam)
library(progressr)
R <- 100
plan(multisession, workers = 14)
handlers(global = TRUE)
res_list <- with_progress({
p <- progressor(steps = R)
future_lapply(
seq_len(R),
function(i) {
out <- simulate_once(datagen_fun)
p(message = sprintf("sim %d/%d", i, R))
out
},
future.seed = TRUE
)
})
sink()
results <- bind_rows(res_list) %>%
group_by(method, term) %>%
summarise(
bias = mean(bias),
var = var(est),
rmse = sqrt(mean((est - true)^2)),
.groups = "drop"
)
print(results)
## multivariaat normaal
method term bias var rmse
<chr> <chr> <dbl> <dbl> <dbl>
3 ae_cfa f2:f3 -0.00196 0.000752 0.0335
7 bartlett f2:f3 0.00313 0.000400 0.0218
11 regression f2:f3 0.00318 0.000400 0.0218
# benchmark voor hyperparamters
print(n=100, summary_grid_gen_data)
term config_id best_bias mean_bias worst_bias best_var mean_var worst_var best_rmse mean_rmse worst_rmse warmup_T penalty_T lambda_cons lambda_jac lambda_cov0 recolor_gamma
<chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 f2:f3 62 -0.0155 -0.0155 -0.0155 0.000866 0.000866 0.000866 0.0335 0.0335 0.0335 50 10 1 0.001 0.1 0.1
2 f2:f3 133 -0.0143 -0.0143 -0.0143 0.00228 0.00228 0.00228 0.0509 0.0509 0.0509 10 10 1 0.01 0.01 0.2
3 f2:f3 22 -0.0112 -0.0112 -0.0112 0.00167 0.00167 0.00167 0.0509 0.0509 0.0509 50 100 1 0.001 0.01 0.1
4 f2:f3 18 -0.0106 -0.0106 -0.0106 0.00136 0.00136 0.00136 0.0424 0.0424 0.0424 50 50 1 0.001 0.01 0.1
5 f2:f3 173 -0.0106 -0.0106 -0.0106 0.00326 0.00326 0.00326 0.0602 0.0602 0.0602 10 50 0.5 0.01 0.1 0.2
6 f2:f3 166 -0.00973 -0.00973 -0.00973 0.00358 0.00358 0.00358 0.0682 0.0682 0.0682 50 100 1 0.001 0.1 0.2
7 f2:f3 142 -0.00889 -0.00889 -0.00889 0.00169 0.00169 0.00169 0.0415 0.0415 0.0415 50 100 1 0.01 0.01 0.2
8 f2:f3 146 -0.00855 -0.00855 -0.00855 0.000843 0.000843 0.000843 0.0349 0.0349 0.0349 50 10 0.5 0.001 0.1 0.2
9 f2:f3 130 -0.00846 -0.00846 -0.00846 0.000934 0.000934 0.000934 0.0338 0.0338 0.0338 50 100 0.5 0.01 0.01 0.2
10 f2:f3 169 -0.00821 -0.00821 -0.00821 0.00275 0.00275 0.00275 0.0592 0.0592 0.0592 10 10 0.5 0.01 0.1 0.2
11 f2:f3 115 -0.00799 -0.00799 -0.00799 0.000535 0.000535 0.000535 0.0226 0.0226 0.0226 100 50 1 0.001 0.01 0.2
12 f2:f3 118 -0.00782 -0.00782 -0.00782 0.00113 0.00113 0.00113 0.0351 0.0351 0.0351 50 100 1 0.001 0.01 0.2
Call:
lm(formula = mean_bias ~ warmup_T + penalty_T + lambda_cons +
lambda_jac + lambda_cov0 + recolor_gamma, data = summary_grid_gen_data)
Residuals:
Min 1Q Median 3Q Max
-0.0154069 -0.0022082 -0.0001164 0.0026887 0.0124764
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 4.289e-05 1.752e-03 0.024 0.9805
warmup_T -1.439e-06 4.865e-06 -0.296 0.7678
penalty_T 1.646e-05 9.390e-06 1.753 0.0812 .
lambda_cons -1.452e-03 1.383e-03 -1.050 0.2950
lambda_jac -4.859e-02 7.683e-02 -0.632 0.5279
lambda_cov0 1.211e-02 7.683e-03 1.577 0.1166
recolor_gamma 6.097e-04 6.914e-03 0.088 0.9298
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.00479 on 185 degrees of freedom
Multiple R-squared: 0.03725, Adjusted R-squared: 0.006025
F-statistic: 1.193 on 6 and 185 DF, p-value: 0.3119
# Conclusie: hyperparamters minder kritisch
## Niet normaal verdeelde data
# Items zijn niet-lineair in hun factor(en) (tanh)
# Elke itemset bevat expliciete cross- en INTERACTIE-termen
# Ruis is heteroscedastisch
method term bias var rmse
<chr> <chr> <dbl> <dbl> <dbl>
3 ae_cfa f2:f3 0.0196 0.00619 0.0826
7 bartlett f2:f3 0.146 0.000754 0.149
11 regression f2:f3 0.146 0.000756 0.149
# benchmark
> print(n=100, summary_grid_gen_data_interact_misfit)
# A tibble: 192 × 17
term config_id best_bias mean_bias worst_bias best_var mean_var worst_var best_rmse mean_rmse worst_rmse warmup_T penalty_T lambda_cons lambda_jac lambda_cov0 recolor_gamma
<chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 f2:f3 129 -0.0629 -0.0629 -0.0629 0.162 0.162 0.162 0.402 0.402 0.402 10 100 0.5 0.01 0.01 0.2
2 f2:f3 153 -0.0370 -0.0370 -0.0370 0.0242 0.0242 0.0242 0.165 0.165 0.165 10 100 0.5 0.001 0.1 0.2
3 f2:f3 73 -0.0364 -0.0364 -0.0364 0.111 0.111 0.111 0.337 0.337 0.337 10 10 0.5 0.01 0.1 0.1
4 f2:f3 149 -0.0335 -0.0335 -0.0335 0.0352 0.0352 0.0352 0.194 0.194 0.194 10 50 0.5 0.001 0.1 0.2
5 f2:f3 101 -0.0239 -0.0239 -0.0239 0.0248 0.0248 0.0248 0.156 0.156 0.156 10 50 0.5 0.001 0.01 0.2
6 f2:f3 9 -0.0230 -0.0230 -0.0230 0.0657 0.0657 0.0657 0.262 0.262 0.262 10 100 0.5 0.001 0.01 0.1
7 f2:f3 33 -0.0198 -0.0198 -0.0198 0.325 0.325 0.325 0.560 0.560 0.560 10 100 0.5 0.01 0.01 0.1
8 f2:f3 69 -0.0116 -0.0116 -0.0116 0.0174 0.0174 0.0174 0.135 0.135 0.135 10 100 1 0.001 0.1 0.1
9 f2:f3 53 -0.0109 -0.0109 -0.0109 0.00518 0.00518 0.00518 0.0759 0.0759 0.0759 10 50 0.5 0.001 0.1 0.1
10 f2:f3 5 -0.00588 -0.00588 -0.00588 0.0416 0.0416 0.0416 0.201 0.201 0.201 10 50 0.5 0.001 0.01 0.1
11 f2:f3 1 -0.00558 -0.00558 -0.00558 0.0501 0.0501 0.0501 0.233 0.233 0.233 10 10 0.5 0.001 0.01 0.1
12 f2:f3 125 -0.00405 -0.00405 -0.00405 0.131 0.131 0.131 0.357 0.357 0.357 10 50 0.5 0.01 0.01 0.2
13 f2:f3 173 -0.00311 -0.00311 -0.00311 0.0569 0.0569 0.0569 0.239 0.239 0.239 10 50 0.5 0.01 0.1 0.2
14 f2:f3 49 -0.00176 -0.00176 -0.00176 0.0298 0.0298 0.0298 0.172 0.172 0.172 10 10 0.5 0.001 0.1 0.1
15 f2:f3 105 -0.00147 -0.00147 -0.00147 0.0262 0.0262 0.0262 0.163 0.163 0.163 10 100 0.5 0.001 0.01 0.2
16 f2:f3 65 0.00189 0.00189 0.00189 0.0309 0.0309 0.0309 0.175 0.175 0.175 10 50 1 0.001 0.1 0.1
17 f2:f3 157 0.00442 0.00442 0.00442 0.0257 0.0257 0.0257 0.170 0.170 0.170 10 10 1 0.001 0.1 0.2
18 f2:f3 161 0.00751 0.00751 0.00751 0.0336 0.0336 0.0336 0.182 0.182 0.182 10 50 1 0.001 0.1 0.2
19 f2:f3 181 0.0150 0.0150 0.0150 0.0917 0.0917 0.0917 0.301 0.301 0.301 10 10 1 0.01 0.1 0.2
Call:
lm(formula = mean_bias ~ warmup_T + penalty_T + lambda_cons +
lambda_jac + lambda_cov0 + recolor_gamma, data = summary_grid_gen_data_interact_misfit)
Residuals:
Min 1Q Median 3Q Max
-0.12299 -0.01240 0.00179 0.01627 0.08641
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.256e-02 1.008e-02 1.246 0.2144
warmup_T 4.177e-04 2.798e-05 14.927 <2e-16 ***
penalty_T -1.010e-05 5.401e-05 -0.187 0.8518
lambda_cons 1.587e-02 7.954e-03 1.995 0.0475 *
lambda_jac 4.176e+00 4.419e-01 9.450 <2e-16 ***
lambda_cov0 -4.808e-03 4.419e-02 -0.109 0.9135
recolor_gamma -2.659e-02 3.977e-02 -0.668 0.5047
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.02755 on 185 degrees of freedom
Multiple R-squared: 0.6312, Adjusted R-squared: 0.6192
F-statistic: 52.77 on 6 and 185 DF, p-value: < 2.2e-16
# Conclusie: De AE is redelijk hyperparameter onafhankelijk. warmup_T en lambda_jac zijn
# wel bepalend voor bias. Houd ze beide klein.