目标

认识 Cox 比例风险（PH）模型的基本形式与风险比 HR的含义。
在 R 中使用 survival::coxph() 进行单变量与多变量 Cox 回归。
学会提取与整理结果表（HR、95%CI、P 值）并绘制森林图。
进行PH 假设检验与诊断可视化（cox.zph()、残差图）。（xhs：飞高高啦）
绘制校正协变量后的预测生存曲线（survfit() + survminer）。

本文数据来自 survival 包内置的 lung 数据集。

准备与数据理解

# 必要包（均为 CRAN）
library(survival)
library(survminer)
library(dplyr)
library(tidyr)
library(broom)      # 整理模型结果
library(ggplot2)
library(knitr)
library(kableExtra)

data("lung", package = "survival")
lung_raw <- lung

# 变量重编码：
# status: 1 = 删失, 2 = 发生事件 -> 转换为 0/1 事件指标
# sex: 1 = 男, 2 = 女 -> 因子并加标签
lung2 <- lung_raw %>%
  mutate(
    event = as.integer(status == 2),
    sex   = factor(sex, levels = c(1, 2), labels = c("Male", "Female"))
  )

# 查看前几行
head(lung2) %>% kable() %>% kable_styling(full_width = FALSE)

inst	time	status	age	sex	ph.ecog	ph.karno	pat.karno	meal.cal	wt.loss	event
3	306	2	74	Male	1	90	100	1175	NA	1
3	455	2	68	Male	0	90	90	1225	15	1
3	1010	1	56	Male	0	90	90	NA	15	0
5	210	2	57	Male	1	90	60	1150	11	1
1	883	2	60	Male	0	100	90	NA	0	1
12	1022	1	74	Male	1	50	80	513	0	0

# 简要缺失情况
lung2 %>%
  summarise(across(everything(), ~ sum(is.na(.)))) %>%
  pivot_longer(everything(), names_to = "var", values_to = "n_miss") %>%
  arrange(desc(n_miss)) %>%
  kable(col.names = c("变量", "缺失数")) %>%
  kable_styling(full_width = FALSE)

变量 \|	失数\|
meal.cal	47
wt.loss	14
pat.karno	3
inst	1
ph.ecog	1
ph.karno	1
time	0
status	0
age	0
sex	0
event	0

时间与事件：后续模型用 Surv(time, event)，其中 time 为天，event 为 1=事件发生、0=删失。

单变量 Cox 回归（xhs：飞高高啦）

我们以 5 个协变量为例：age, sex, ph.ecog, ph.karno, wt.loss。

covariates <- c("age", "sex", "ph.ecog", "ph.karno", "wt.loss")

# 构建公式并逐一拟合
univ_models <- lapply(
  covariates,
  function(v) coxph(as.formula(paste0("Surv(time, event) ~ ", v)), data = lung2)
)
names(univ_models) <- covariates

# 整理为结果表
univ_tbl <- purrr::map_dfr(
  univ_models,
  ~ broom::tidy(.x, conf.int = TRUE, exponentiate = TRUE),
  .id = "variable"
) %>%
  mutate(term = ifelse(term == variable, as.character(term),
                       paste(variable, term, sep=":"))) %>%
  select(变量 = term,
         HR = estimate,
         CI下限 = conf.low,
         CI上限 = conf.high,
         `Wald z` = statistic,
         `P值` = p.value) %>%
  mutate(
    HR = sprintf("%.2f", HR),
    CI下限 = sprintf("%.2f", CI下限),
    CI上限 = sprintf("%.2f", CI上限),
    `HR (95%CI)` = paste0(HR, " (", CI下限, "–", CI上限, ")")
  ) %>%
  select(变量, `HR (95%CI)`, `Wald z`, `P值`)

univ_tbl

## # A tibble: 5 × 4
##   变量          `HR (95%CI)`     `Wald z`       P值
##   <chr>         <chr>               <dbl>     <dbl>
## 1 age           1.02 (1.00–1.04)    2.03  0.0419   
## 2 sex:sexFemale 0.59 (0.42–0.82)   -3.18  0.00149  
## 3 ph.ecog       1.61 (1.29–2.01)    4.20  0.0000269
## 4 ph.karno      0.98 (0.97–1.00)   -2.81  0.00496  
## 5 wt.loss       1.00 (0.99–1.01)    0.217 0.828

解释要点： - 当 HR > 1，协变量升高（或相对于参照组）与风险增加相关；HR < 1 与风险降低相关。
- 关注 P值 与 HR 的 95%CI 是否跨越 1。

多变量 Cox 回归（xhs：飞高高啦）

根据上一步筛选（示例中使用 age + sex + ph.ecog），拟合多变量模型：

fit_full <- coxph(Surv(time, event) ~ age + sex + ph.ecog, data = lung2, ties = "efron")
summary(fit_full)

## Call:
## coxph(formula = Surv(time, event) ~ age + sex + ph.ecog, data = lung2, 
##     ties = "efron")
## 
##   n= 227, number of events= 164 
##    (1 observation deleted due to missingness)
## 
##                coef exp(coef)  se(coef)      z Pr(>|z|)    
## age        0.011067  1.011128  0.009267  1.194 0.232416    
## sexFemale -0.552612  0.575445  0.167739 -3.294 0.000986 ***
## ph.ecog    0.463728  1.589991  0.113577  4.083 4.45e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##           exp(coef) exp(-coef) lower .95 upper .95
## age          1.0111     0.9890    0.9929    1.0297
## sexFemale    0.5754     1.7378    0.4142    0.7994
## ph.ecog      1.5900     0.6289    1.2727    1.9864
## 
## Concordance= 0.637  (se = 0.025 )
## Likelihood ratio test= 30.5  on 3 df,   p=1e-06
## Wald test            = 29.93  on 3 df,   p=1e-06
## Score (logrank) test = 30.5  on 3 df,   p=1e-06

multiv_tbl <- broom::tidy(fit_full, conf.int = TRUE, exponentiate = TRUE) %>%
  transmute(
    协变量 = term,
    HR = sprintf("%.2f", estimate),
    `95%CI` = paste0(sprintf("%.2f", conf.low), "–", sprintf("%.2f", conf.high)),
    `HR (95%CI)` = paste0(HR, " (", `95%CI`, ")"),
    `Wald z` = sprintf("%.2f", statistic),
    `P值` = format.pval(p.value, eps = 1e-4)
  )

kable(multiv_tbl, align = "lcccc") %>% kable_styling(full_width = FALSE)

协变量 \|	R \|	95%CI \|	HR (95%CI) \| W	ld z \|P值
age	1.01	0.99–1.03	1.01 (0.99–1.03)	1.19	0.23241568
sexFemale	0.58	0.41–0.80	0.58 (0.41–0.80)	-3.29	0.00098605
ph.ecog	1.59	1.27–1.99	1.59 (1.27–1.99)	4.08	< 1e-04

结果解读示例：在控制其余变量后，sexFemale 的 HR 若为 0.58（示例值），可解读为：女性相较男性死亡风险降低 42%。ph.ecog 若 HR≈1.59，表示每升高 1 分，风险增加约 59%。

PH 假设检验与诊断（xhs：飞高高啦）

Cox 模型关键假设：各组风险比随时间恒定（比例风险）。用 cox.zph() 检验：

ph_test <- cox.zph(fit_full)
ph_test

##         chisq df    p
## age     0.188  1 0.66
## sex     2.305  1 0.13
## ph.ecog 2.054  1 0.15
## GLOBAL  4.464  3 0.22

若整体检验与各项检验 P 值均 > 0.05，通常认为满足 PH 假设。
可视化查看 Schoenfeld 残差 vs. 时间的趋势：

plot(ph_test)

abline(h = 0, lty = 2)

若出现显著偏离，可考虑分层 Cox、加入时间交互项或分段时间效应等扩展。

预测生存曲线（校正协变量）（xhs：飞高高啦）

用 survfit() 计算在指定协变量组合下的预测生存曲线。示例：比较男女、保持其他变量为固定值。

# 设定 ph.ecog 固定为 1；年龄固定为样本均值
newdat <- with(lung2, data.frame(
  age = rep(mean(age, na.rm = TRUE), 2),
  sex = factor(c("Male", "Female"), levels = levels(sex)),
  ph.ecog = c(1, 1)
))

newdat

##        age    sex ph.ecog
## 1 62.44737   Male       1
## 2 62.44737 Female       1

fit_sf <- survfit(fit_full, newdata = newdat)
ggsurvplot(
  fit_sf,
  data = newdat,
  conf.int = TRUE,
  legend.title = "Sex",
  legend.labs = c("Male", "Female"),
  ggtheme = theme_minimal(),
  xlab = "Days",
  ylab = "Predicted survival probability"
)

森林图（多变量 HR）（xhs：飞高高啦）

使用 ggplot2 自制简洁森林图：

plot_dat <- broom::tidy(fit_full, conf.int = TRUE, exponentiate = TRUE) %>%
  filter(term != "(Intercept)") %>%
  mutate(
    term = dplyr::recode(term, "age" = "Age (years)", "sexFemale" = "Sex: Female vs Male",
                         "ph.ecog" = "ECOG score"),
    term = factor(term, levels = rev(c("Age (years)", "Sex: Female vs Male", "ECOG score")))
  )

ggplot(plot_dat, aes(x = estimate, y = term)) +
  geom_point() +
  geom_errorbarh(aes(xmin = conf.low, xmax = conf.high), height = 0.2) +
  geom_vline(xintercept = 1, linetype = 2) +
  scale_x_continuous(trans = "log10") +
  labs(x = "Hazard Ratio (log scale)", y = NULL, title = "Forest plot of multivariable Cox model") +
  theme_minimal(base_size = 12)

残差诊断（可选）（xhs：飞高高啦）

# 1) 计算 Martingale 残差（长度 = 模型使用的样本数）
res_mart <- residuals(fit_full, type = "martingale")

# 2) 取出模型实际用到的数据（已按同一行顺序、去掉缺失）
df_used <- model.frame(fit_full)  # 包含 age, sex, ph.ecog, Surv(time, event)

# 3) 绑定残差后作图
df_plot <- transform(df_used, res_mart = res_mart)

library(ggplot2)
ggplot(df_plot, aes(x = age, y = res_mart)) +
  geom_point(alpha = 0.7) +
  geom_smooth(se = FALSE, method = "loess") +
  labs(x = "Age", y = "Martingale residuals")

res_dev <- residuals(fit_full, type = "deviance")
df_plot2 <- transform(df_used, res_dev = res_dev)

ggplot(df_plot2, aes(x = seq_along(res_dev), y = res_dev)) +
  geom_point(alpha = 0.7) +
  geom_hline(yintercept = 0, linetype = 2) +
  labs(x = "Index", y = "Deviance residuals")

若发现异常点或非线性，可考虑：对连续变量进行样条/分段建模（如 splines::ns()）、或对异常点做敏感性分析。

报告撰写模版（可直接复制）

统计方法（示例）：
> 采用 Cox 比例风险模型评估协变量与死亡风险的关联，以时间（天）为时标，删失定义为随访期内未发生事件或失访。报告风险比（HR）及其 95% 置信区间。使用 Efron 方法处理 ties。模型纳入的协变量包括年龄、性别与 ECOG 评分。通过 Schoenfeld 残差检验与可视化检查比例风险假设。使用 survfit() 在给定协变量组合下绘制预测生存曲线。

结果解释（示例）：
> 多变量模型显示，女性相较男性 HR=0.xx（95%CI: a–b，P<0.001），提示女性为良好预后因素；ECOG 每升高 1 分 HR=1.yy（95%CI: c–d，P<0.001），提示体能状态差与风险增加相关；年龄的效应在调整后不显著（HR≈1.01，P=0.23）。整体与分量检验均提示模型拟合良好，PH 假设未被拒绝。

附录：常见问题（FAQ）

PH 假设不满足怎么办？
- 分层 Cox（strata(var)）；
- 加入时间交互项（var:tt(time) 或 coxph(..., tt = function(x, t, ...) )）；
- 使用分段时间或 AFT 模型作为替代；
- 对显著非比例的协变量采用时变系数。
如何处理连续变量的非线性？
- 使用自然样条 splines::ns() 或平滑函数；比较 AIC/BIC 或可视化残差。
如何导出表格与图形？
- 表格：knitr::kable()/gt::gt() 保存为 HTML/Word/PDF。
- 图：ggsave("figure.png", dpi = 300, width = 7, height = 5)。

```

R 语言——Cox 比例风险模型）

xhs：飞高高啦

2025-9-20

目标