目标

本文数据来自 survival 包内置的 lung 数据集。

准备与数据理解

# 必要包(均为 CRAN)
library(survival)
library(survminer)
library(dplyr)
library(tidyr)
library(broom)      # 整理模型结果
library(ggplot2)
library(knitr)
library(kableExtra)
data("lung", package = "survival")
lung_raw <- lung

# 变量重编码:
# status: 1 = 删失, 2 = 发生事件 -> 转换为 0/1 事件指标
# sex: 1 = 男, 2 = 女 -> 因子并加标签
lung2 <- lung_raw %>%
  mutate(
    event = as.integer(status == 2),
    sex   = factor(sex, levels = c(1, 2), labels = c("Male", "Female"))
  )

# 查看前几行
head(lung2) %>% kable() %>% kable_styling(full_width = FALSE)
inst time status age sex ph.ecog ph.karno pat.karno meal.cal wt.loss event
3 306 2 74 Male 1 90 100 1175 NA 1
3 455 2 68 Male 0 90 90 1225 15 1
3 1010 1 56 Male 0 90 90 NA 15 0
5 210 2 57 Male 1 90 60 1150 11 1
1 883 2 60 Male 0 100 90 NA 0 1
12 1022 1 74 Male 1 50 80 513 0 0
# 简要缺失情况
lung2 %>%
  summarise(across(everything(), ~ sum(is.na(.)))) %>%
  pivot_longer(everything(), names_to = "var", values_to = "n_miss") %>%
  arrange(desc(n_miss)) %>%
  kable(col.names = c("变量", "缺失数")) %>%
  kable_styling(full_width = FALSE)
变量 | 失数|
meal.cal 47
wt.loss 14
pat.karno 3
inst 1
ph.ecog 1
ph.karno 1
time 0
status 0
age 0
sex 0
event 0

时间与事件:后续模型用 Surv(time, event),其中 time 为天,event 为 1=事件发生、0=删失。

单变量 Cox 回归(xhs:飞高高啦)

我们以 5 个协变量为例:age, sex, ph.ecog, ph.karno, wt.loss

covariates <- c("age", "sex", "ph.ecog", "ph.karno", "wt.loss")

# 构建公式并逐一拟合
univ_models <- lapply(
  covariates,
  function(v) coxph(as.formula(paste0("Surv(time, event) ~ ", v)), data = lung2)
)
names(univ_models) <- covariates

# 整理为结果表
univ_tbl <- purrr::map_dfr(
  univ_models,
  ~ broom::tidy(.x, conf.int = TRUE, exponentiate = TRUE),
  .id = "variable"
) %>%
  mutate(term = ifelse(term == variable, as.character(term),
                       paste(variable, term, sep=":"))) %>%
  select(变量 = term,
         HR = estimate,
         CI下限 = conf.low,
         CI上限 = conf.high,
         `Wald z` = statistic,
         `P值` = p.value) %>%
  mutate(
    HR = sprintf("%.2f", HR),
    CI下限 = sprintf("%.2f", CI下限),
    CI上限 = sprintf("%.2f", CI上限),
    `HR (95%CI)` = paste0(HR, " (", CI下限, "–", CI上限, ")")
  ) %>%
  select(变量, `HR (95%CI)`, `Wald z`, `P值`)

univ_tbl
## # A tibble: 5 × 4
##   变量          `HR (95%CI)`     `Wald z`       P值
##   <chr>         <chr>               <dbl>     <dbl>
## 1 age           1.02 (1.00–1.04)    2.03  0.0419   
## 2 sex:sexFemale 0.59 (0.42–0.82)   -3.18  0.00149  
## 3 ph.ecog       1.61 (1.29–2.01)    4.20  0.0000269
## 4 ph.karno      0.98 (0.97–1.00)   -2.81  0.00496  
## 5 wt.loss       1.00 (0.99–1.01)    0.217 0.828

解释要点: - 当 HR > 1,协变量升高(或相对于参照组)与风险增加相关;HR < 1 与风险降低相关。
- 关注 P值 与 HR 的 95%CI 是否跨越 1。

多变量 Cox 回归(xhs:飞高高啦)

根据上一步筛选(示例中使用 age + sex + ph.ecog),拟合多变量模型:

fit_full <- coxph(Surv(time, event) ~ age + sex + ph.ecog, data = lung2, ties = "efron")
summary(fit_full)
## Call:
## coxph(formula = Surv(time, event) ~ age + sex + ph.ecog, data = lung2, 
##     ties = "efron")
## 
##   n= 227, number of events= 164 
##    (1 observation deleted due to missingness)
## 
##                coef exp(coef)  se(coef)      z Pr(>|z|)    
## age        0.011067  1.011128  0.009267  1.194 0.232416    
## sexFemale -0.552612  0.575445  0.167739 -3.294 0.000986 ***
## ph.ecog    0.463728  1.589991  0.113577  4.083 4.45e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##           exp(coef) exp(-coef) lower .95 upper .95
## age          1.0111     0.9890    0.9929    1.0297
## sexFemale    0.5754     1.7378    0.4142    0.7994
## ph.ecog      1.5900     0.6289    1.2727    1.9864
## 
## Concordance= 0.637  (se = 0.025 )
## Likelihood ratio test= 30.5  on 3 df,   p=1e-06
## Wald test            = 29.93  on 3 df,   p=1e-06
## Score (logrank) test = 30.5  on 3 df,   p=1e-06
multiv_tbl <- broom::tidy(fit_full, conf.int = TRUE, exponentiate = TRUE) %>%
  transmute(
    协变量 = term,
    HR = sprintf("%.2f", estimate),
    `95%CI` = paste0(sprintf("%.2f", conf.low), "–", sprintf("%.2f", conf.high)),
    `HR (95%CI)` = paste0(HR, " (", `95%CI`, ")"),
    `Wald z` = sprintf("%.2f", statistic),
    `P值` = format.pval(p.value, eps = 1e-4)
  )

kable(multiv_tbl, align = "lcccc") %>% kable_styling(full_width = FALSE)
协变量 | R | 95%CI | HR (95%CI) | W ld z |P值
age 1.01 0.99–1.03 1.01 (0.99–1.03) 1.19 0.23241568
sexFemale 0.58 0.41–0.80 0.58 (0.41–0.80) -3.29 0.00098605
ph.ecog 1.59 1.27–1.99 1.59 (1.27–1.99) 4.08 < 1e-04

结果解读示例:在控制其余变量后,sexFemale 的 HR 若为 0.58(示例值),可解读为:女性相较男性死亡风险降低 42%。ph.ecog 若 HR≈1.59,表示每升高 1 分,风险增加约 59%。

PH 假设检验与诊断(xhs:飞高高啦)

Cox 模型关键假设:各组风险比随时间恒定(比例风险)。用 cox.zph() 检验:

ph_test <- cox.zph(fit_full)
ph_test
##         chisq df    p
## age     0.188  1 0.66
## sex     2.305  1 0.13
## ph.ecog 2.054  1 0.15
## GLOBAL  4.464  3 0.22
plot(ph_test)

abline(h = 0, lty = 2)

若出现显著偏离,可考虑分层 Cox、加入时间交互项或分段时间效应等扩展。

预测生存曲线(校正协变量)(xhs:飞高高啦)

survfit() 计算在指定协变量组合下的预测生存曲线。示例:比较男女、保持其他变量为固定值。

# 设定 ph.ecog 固定为 1;年龄固定为样本均值
newdat <- with(lung2, data.frame(
  age = rep(mean(age, na.rm = TRUE), 2),
  sex = factor(c("Male", "Female"), levels = levels(sex)),
  ph.ecog = c(1, 1)
))

newdat
##        age    sex ph.ecog
## 1 62.44737   Male       1
## 2 62.44737 Female       1
fit_sf <- survfit(fit_full, newdata = newdat)
ggsurvplot(
  fit_sf,
  data = newdat,
  conf.int = TRUE,
  legend.title = "Sex",
  legend.labs = c("Male", "Female"),
  ggtheme = theme_minimal(),
  xlab = "Days",
  ylab = "Predicted survival probability"
)

森林图(多变量 HR)(xhs:飞高高啦)

使用 ggplot2 自制简洁森林图:

plot_dat <- broom::tidy(fit_full, conf.int = TRUE, exponentiate = TRUE) %>%
  filter(term != "(Intercept)") %>%
  mutate(
    term = dplyr::recode(term, "age" = "Age (years)", "sexFemale" = "Sex: Female vs Male",
                         "ph.ecog" = "ECOG score"),
    term = factor(term, levels = rev(c("Age (years)", "Sex: Female vs Male", "ECOG score")))
  )

ggplot(plot_dat, aes(x = estimate, y = term)) +
  geom_point() +
  geom_errorbarh(aes(xmin = conf.low, xmax = conf.high), height = 0.2) +
  geom_vline(xintercept = 1, linetype = 2) +
  scale_x_continuous(trans = "log10") +
  labs(x = "Hazard Ratio (log scale)", y = NULL, title = "Forest plot of multivariable Cox model") +
  theme_minimal(base_size = 12)

残差诊断(可选)(xhs:飞高高啦)

# 1) 计算 Martingale 残差(长度 = 模型使用的样本数)
res_mart <- residuals(fit_full, type = "martingale")

# 2) 取出模型实际用到的数据(已按同一行顺序、去掉缺失)
df_used <- model.frame(fit_full)  # 包含 age, sex, ph.ecog, Surv(time, event)

# 3) 绑定残差后作图
df_plot <- transform(df_used, res_mart = res_mart)

library(ggplot2)
ggplot(df_plot, aes(x = age, y = res_mart)) +
  geom_point(alpha = 0.7) +
  geom_smooth(se = FALSE, method = "loess") +
  labs(x = "Age", y = "Martingale residuals")

res_dev <- residuals(fit_full, type = "deviance")
df_plot2 <- transform(df_used, res_dev = res_dev)

ggplot(df_plot2, aes(x = seq_along(res_dev), y = res_dev)) +
  geom_point(alpha = 0.7) +
  geom_hline(yintercept = 0, linetype = 2) +
  labs(x = "Index", y = "Deviance residuals")

若发现异常点或非线性,可考虑:对连续变量进行样条/分段建模(如 splines::ns())、或对异常点做敏感性分析。

报告撰写模版(可直接复制)

统计方法(示例):
> 采用 Cox 比例风险模型评估协变量与死亡风险的关联,以时间(天)为时标,删失定义为随访期内未发生事件或失访。报告风险比(HR)及其 95% 置信区间。使用 Efron 方法处理 ties。模型纳入的协变量包括年龄、性别与 ECOG 评分。通过 Schoenfeld 残差检验与可视化检查比例风险假设。使用 survfit() 在给定协变量组合下绘制预测生存曲线。

结果解释(示例):
> 多变量模型显示,女性相较男性 HR=0.xx(95%CI: a–b,P<0.001),提示女性为良好预后因素;ECOG 每升高 1 分 HR=1.yy(95%CI: c–d,P<0.001),提示体能状态差与风险增加相关;年龄的效应在调整后不显著(HR≈1.01,P=0.23)。整体与分量检验均提示模型拟合良好,PH 假设未被拒绝。


附录:常见问题(FAQ)

  1. PH 假设不满足怎么办?
    • 分层 Cox(strata(var));
    • 加入时间交互项(var:tt(time)coxph(..., tt = function(x, t, ...) ));
    • 使用分段时间或 AFT 模型作为替代;
    • 对显著非比例的协变量采用时变系数。
  2. 如何处理连续变量的非线性?
    • 使用自然样条 splines::ns() 或平滑函数;比较 AIC/BIC 或可视化残差。
  3. 如何导出表格与图形?
    • 表格:knitr::kable()/gt::gt() 保存为 HTML/Word/PDF。
    • 图:ggsave("figure.png", dpi = 300, width = 7, height = 5)

```