为什么用 LMM?
重复测量/随访数据中,同一受试者多次观测值相关,普通线性回归假设(独立同分布)被破坏。LMM
通过随机截距/随机斜率捕捉个体间差异与轨迹差异。
本案例问题
评估基线尿蛋白(正常/微量/大量)对 GFR
基线水平与年下降率(斜率)的影响,并校正年龄与性别。
关键解释
time
的系数 =
“参考组(正常蛋白)平均斜率”。time:Micro
/time:Macro
交互 =
与参考组斜率的差值。Micro
/Macro
主效应 = 在
time = 0
(基线)时,与参考组截距差。(1 + time | patient)
:允许不同病人有不同的起点和下降速度。need <- c(
"tidyverse","nlme","lme4","broom","broom.mixed",
"parameters","performance","ggeffects","patchwork","gt","scales"
)
for (p in need) {
if (!requireNamespace(p, quietly = TRUE)) install.packages(p)
library(p, character.only = TRUE)
}
theme_set(theme_minimal(base_size = 12))
# 读入数据:字段含 patient, visit, time, GFR, age, gender, micro, macro
dat <- readr::read_csv(params$data_path, show_col_types = FALSE)
# 基础整理:构造三水平因子,便于作图
dat <- dat %>%
mutate(
prot = case_when(
micro == 1 ~ "Micro",
macro == 1 ~ "Macro",
TRUE ~ "Normal"
),
prot = factor(prot, levels = c("Normal","Micro","Macro")),
gender = factor(gender, levels = c(0,1), labels = c("Male","Female"))
)
# 快速质控
summary(select(dat, time, GFR, age))
## time GFR age
## Min. :0.000 Min. : -5.966 Min. :20.00
## 1st Qu.:1.164 1st Qu.: 30.793 1st Qu.:48.89
## Median :2.986 Median : 46.164 Median :60.63
## Mean :3.161 Mean : 46.518 Mean :59.36
## 3rd Qu.:5.013 3rd Qu.: 60.888 3rd Qu.:69.65
## Max. :8.185 Max. :109.874 Max. :90.00
dat %>% count(patient, name = "n_visit") %>% summarise(n= n(), mean_visit = mean(n_visit), min = min(n_visit), max = max(n_visit))
dat %>% summarise(miss_GFR = sum(is.na(GFR)), n = n())
nlme::lme
fit_lme <- nlme::lme(
fixed = GFR ~ time + age + gender + micro + macro + time:micro + time:macro,
random = ~ 1 + time | patient, # 随机截距+随机斜率
method = "REML",
data = dat,
control = nlme::lmeControl(opt = "optim")
)
summary(fit_lme)
## Linear mixed-effects model fit by REML
## Data: dat
## AIC BIC logLik
## 9532.979 9595.649 -4754.489
##
## Random effects:
## Formula: ~1 + time | patient
## Structure: General positive-definite, Log-Cholesky parametrization
## StdDev Corr
## (Intercept) 10.573731 (Intr)
## time 3.623581 -0.054
## Residual 4.902855
##
## Fixed effects: GFR ~ time + age + gender + micro + macro + time:micro + time:macro
## Value Std.Error DF t-value p-value
## (Intercept) 83.40219 3.274726 1175 25.468451 0.0000
## time -1.74107 0.417145 1175 -4.173768 0.0000
## age -0.28751 0.052077 195 -5.520829 0.0000
## genderFemale -2.61576 1.698787 195 -1.539779 0.1252
## micro -18.39562 1.895459 195 -9.705100 0.0000
## macro -26.56474 1.916902 195 -13.858159 0.0000
## time:micro -2.92360 0.642242 1175 -4.552177 0.0000
## time:macro -3.09587 0.673877 1175 -4.594122 0.0000
## Correlation:
## (Intr) time age gndrFm micro macro time:micr
## time -0.039
## age -0.914 0.001
## genderFemale -0.018 0.000 -0.136
## micro -0.169 0.066 -0.085 -0.002
## macro -0.244 0.066 0.014 -0.108 0.425
## time:micro 0.026 -0.650 -0.001 -0.002 -0.113 -0.042
## time:macro 0.023 -0.619 0.001 -0.004 -0.041 -0.122 0.402
##
## Standardized Within-Group Residuals:
## Min Q1 Med Q3 Max
## -3.16211262 -0.59971196 -0.01943421 0.58074758 3.14149972
##
## Number of Observations: 1378
## Number of Groups: 200
parameters::model_parameters(fit_lme, effects = "fixed", ci = 0.95)
VarCorr(fit_lme)
## patient = pdLogChol(1 + time)
## Variance StdDev Corr
## (Intercept) 111.80379 10.573731 (Intr)
## time 13.13034 3.623581 -0.054
## Residual 24.03799 4.902855
解释提醒:
- 参考组 = Normal(micro=0 & macro=0
)
- 参考组斜率 =time
的系数
- Micro 斜率 =time + time:micro
- Macro 斜率 =time + time:macro
- Micro 基线差 =micro
的系数;Macro 基线差 =macro
的系数(均在 age、gender 校正后)
lme4::lmer
(因子写法,更利于可视化)fit_lmer <- lme4::lmer(
GFR ~ time * prot + age + gender + (1 + time | patient),
data = dat, REML = TRUE
)
summary(fit_lmer)
## Linear mixed model fit by REML ['lmerMod']
## Formula: GFR ~ time * prot + age + gender + (1 + time | patient)
## Data: dat
##
## REML criterion at convergence: 9509
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.16211 -0.59971 -0.01943 0.58075 3.14150
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## patient (Intercept) 111.80 10.574
## time 13.13 3.624 -0.05
## Residual 24.04 4.903
## Number of obs: 1378, groups: patient, 200
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 83.40217 3.27471 25.469
## time -1.74107 0.41714 -4.174
## protMicro -18.39562 1.89545 -9.705
## protMacro -26.56474 1.91690 -13.858
## age -0.28751 0.05208 -5.521
## genderFemale -2.61577 1.69878 -1.540
## time:protMicro -2.92359 0.64224 -4.552
## time:protMacro -3.09586 0.67387 -4.594
##
## Correlation of Fixed Effects:
## (Intr) time protMicr protMacr age gndrFm time:protMicr
## time -0.039
## protMicro -0.169 0.066
## protMacro -0.244 0.066 0.425
## age -0.914 0.001 -0.085 0.014
## genderFemal -0.018 0.000 -0.002 -0.108 -0.136
## time:protMicr 0.026 -0.650 -0.113 -0.042 -0.001 -0.002
## time:protMacr 0.023 -0.619 -0.041 -0.122 0.001 -0.004 0.402
# 展示固定效应(含 95% CI 与 p 值)
parameters::model_parameters(fit_lmer, effects = "fixed", ci = 0.95)
tid <- broom.mixed::tidy(fit_lme, effects = "fixed", conf.int = TRUE)
beta_time <- tid$estimate[tid$term == "time"]
beta_time_mic <- tid$estimate[tid$term == "time:micro"]
beta_time_mac <- tid$estimate[tid$term == "time:macro"]
# 三组斜率(yearly change of GFR)
slope_normal <- beta_time
slope_micro <- beta_time + beta_time_mic
slope_macro <- beta_time + beta_time_mac
# 基线差(time=0 时截距差)
b0_micro <- tid %>% dplyr::filter(term == "micro") %>% dplyr::transmute(beta = estimate, lo = conf.low, hi = conf.high)
b0_macro <- tid %>% dplyr::filter(term == "macro") %>% dplyr::transmute(beta = estimate, lo = conf.low, hi = conf.high)
res_tab <- tibble::tibble(
指标 = c("正常组斜率","微量组-正常斜率差","大量组-正常斜率差","微量组基线差","大量组基线差"),
估计 = c(slope_normal, beta_time_mic, beta_time_mac, b0_micro$beta, b0_macro$beta),
`95%CI下限` = c(
tid$conf.low[tid$term=="time"],
tid$conf.low[tid$term=="time:micro"],
tid$conf.low[tid$term=="time:macro"],
b0_micro$lo, b0_macro$lo
),
`95%CI上限` = c(
tid$conf.high[tid$term=="time"],
tid$conf.high[tid$term=="time:micro"],
tid$conf.high[tid$term=="time:macro"],
b0_micro$hi, b0_macro$hi
)
)
res_tab %>% gt::gt() %>% gt::fmt_number(2:4, decimals = 2)
指标 | 估计 | 95%CI下限 | 95%CI上限 |
---|---|---|---|
正常组斜率 | −1.74 | −2.56 | −0.92 |
微量组-正常斜率差 | −2.92 | −4.18 | −1.66 |
大量组-正常斜率差 | −3.10 | −4.42 | −1.77 |
微量组基线差 | −18.40 | −22.13 | −14.66 |
大量组基线差 | −26.56 | −30.35 | −22.78 |
# 使用 lmer + ggeffects 生成边际预测及 95% CI(年龄/性别固定在样本均值/众数)
pred <- ggeffects::ggpredict(
fit_lmer,
terms = c("time [all]", "prot"),
type = "fixed"
)
p_interact <- ggplot(pred, aes(x = x, y = predicted, ymin = conf.low, ymax = conf.high, color = group, fill = group)) +
geom_ribbon(alpha = 0.15, linewidth = 0) +
geom_line(linewidth = 1) +
scale_x_continuous("随访时间(年)", breaks = scales::pretty_breaks()) +
scale_y_continuous("预测 GFR", breaks = scales::pretty_breaks()) +
labs(color = "尿蛋白组别", fill = "尿蛋白组别", title = "时间×尿蛋白 交互的边际预测曲线(校正年龄、性别)") +
theme(legend.position = "top")
p_interact
set.seed(1)
id_tbl <- dat %>% dplyr::distinct(patient)
n_ids <- nrow(id_tbl)
ids <- id_tbl %>%
dplyr::slice_sample(n = min(30, n_ids)) %>%
dplyr::pull(patient)
sub <- dat %>% dplyr::filter(patient %in% ids)
# 组均值轨迹
mean_traj <- dat %>%
dplyr::group_by(prot, time) %>%
dplyr::summarise(mean_GFR = mean(GFR, na.rm = TRUE), .groups = "drop")
p_spa <- ggplot(sub, aes(time, GFR, group = patient, color = prot)) +
geom_line(alpha = 0.35) +
geom_line(
data = mean_traj,
mapping = aes(x = time, y = mean_GFR, color = prot, group = prot),
linewidth = 1.2,
inherit.aes = FALSE # ← 关键
) +
labs(x = "时间(年)", y = "GFR", color = "尿蛋白组别",
title = "个体轨迹(浅)与组均值轨迹(粗)") +
theme(legend.position = "top")
p_spa
# 提取随机效应(含条件方差)
re_list <- lme4::ranef(fit_lmer, condVar = TRUE)
re_pat <- re_list$patient # data.frame: (Intercept), time
postVar <- attr(re_pat, "postVar") # 数组 [nRE x nRE x n_patients],此处 nRE=2
# 各随机效应的标准误
se_intercept <- sqrt(postVar[1, 1, ])
se_time <- sqrt(postVar[2, 2, ])
z <- qnorm(0.975)
library(tibble); library(dplyr); library(ggplot2)
slopes <- re_pat %>%
rownames_to_column("patient") %>%
as_tibble() %>%
rename(rand_intercept = `(Intercept)`, rand_time = time) %>%
mutate(
se_intercept = se_intercept,
se_time = se_time,
lo_time = rand_time - z * se_time,
hi_time = rand_time + z * se_time
) %>%
arrange(rand_time) %>%
mutate(patient = factor(patient, levels = patient))
# 毛毛虫图(随机斜率 + 95% CI)
ggplot(slopes, aes(x = patient, y = rand_time)) +
geom_point(size = 1.4) +
geom_errorbar(aes(ymin = lo_time, ymax = hi_time), width = 0.15) +
geom_hline(yintercept = 0, linetype = 2) +
coord_flip() +
labs(x = "Patient", y = "随机斜率(相对总体均值的偏差)",
title = "随机斜率毛毛虫图(含95% CI)")
coef_df <- broom.mixed::tidy(fit_lmer, effects = "fixed", conf.int = TRUE) %>%
dplyr::mutate(term = forcats::fct_reorder(term, estimate))
p_for <- ggplot(coef_df, aes(x = estimate, y = term)) +
geom_point() +
geom_errorbarh(aes(xmin = conf.low, xmax = conf.high), height = 0.2) +
geom_vline(xintercept = 0, linetype = 2) +
labs(x = "估计值(95% CI)", y = NULL, title = "固定效应系数森林图") +
theme(axis.text.y = element_text(size = 10))
p_for
lmer
默认不提供 p 值;若想基于
Satterthwaite/Kenward-Roger 得到 p 值,可:
lmerTest::lmer()
拟合;或lme4::lmer()
,再用
parameters::model_parameters()
计算近似 p
值(本文做法)。sessionInfo()
## R version 4.4.2 (2024-10-31)
## Platform: aarch64-apple-darwin20
## Running under: macOS Sequoia 15.1
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.0
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## time zone: Asia/Shanghai
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] scales_1.4.0 gt_1.0.0 patchwork_1.3.2
## [4] ggeffects_2.3.1 performance_0.15.1 parameters_0.28.2
## [7] broom.mixed_0.2.9.6 broom_1.0.10 lme4_1.1-37
## [10] Matrix_1.7-4 nlme_3.1-168 lubridate_1.9.4
## [13] forcats_1.0.0 stringr_1.5.2 dplyr_1.1.4
## [16] purrr_1.1.0 readr_2.1.5 tidyr_1.3.1
## [19] tibble_3.3.0 ggplot2_4.0.0 tidyverse_2.0.0
## [22] showtext_0.9-7 showtextdb_3.0 sysfonts_0.8.9
##
## loaded via a namespace (and not attached):
## [1] sjlabelled_1.2.0 tidyselect_1.2.1 farver_2.1.2 S7_0.2.0
## [5] fastmap_1.2.0 TH.data_1.1-4 bayestestR_0.17.0 digest_0.6.37
## [9] timechange_0.3.0 estimability_1.5.1 lifecycle_1.0.4 survival_3.8-3
## [13] magrittr_2.0.4 compiler_4.4.2 rlang_1.1.6 sass_0.4.10
## [17] tools_4.4.2 yaml_2.3.10 knitr_1.50 labeling_0.4.3
## [21] bit_4.6.0 xml2_1.4.0 RColorBrewer_1.1-3 multcomp_1.4-28
## [25] withr_3.0.2 grid_4.4.2 datawizard_1.2.0 xtable_1.8-4
## [29] future_1.67.0 globals_0.18.0 emmeans_1.11.2-8 MASS_7.3-65
## [33] insight_1.4.2 cli_3.6.5 mvtnorm_1.3-3 crayon_1.5.3
## [37] rmarkdown_2.29 reformulas_0.4.1 generics_0.1.4 rstudioapi_0.17.1
## [41] tzdb_0.5.0 minqa_1.2.8 cachem_1.1.0 splines_4.4.2
## [45] parallel_4.4.2 vctrs_0.6.5 boot_1.3-32 sandwich_3.1-1
## [49] jsonlite_2.0.0 hms_1.1.3 bit64_4.6.0-1 listenv_0.9.1
## [53] jquerylib_0.1.4 glue_1.8.0 parallelly_1.45.1 nloptr_2.2.1
## [57] codetools_0.2-20 stringi_1.8.7 gtable_0.3.6 furrr_0.3.1
## [61] pillar_1.11.1 htmltools_0.5.8.1 R6_2.6.1 Rdpack_2.6.4
## [65] vroom_1.6.6 evaluate_1.0.5 lattice_0.22-7 haven_2.5.5
## [69] rbibutils_2.3 backports_1.5.0 snakecase_0.11.1 bslib_0.9.0
## [73] Rcpp_1.1.0 coda_0.19-4.1 xfun_0.53 zoo_1.8-14
## [77] pkgconfig_2.0.3