生存分析

数据准备

在此采用 easyTCGA 下载好的乳腺癌数据进行生存分析。

library(easyTCGA)

getmrnaexpr('TCGA-BRCA')

加载表达矩阵和临床信息。

load("D://Rlearn/TCGA/output_mRNA_lncRNA_expr/TCGA-BRCA_mrna_expr_counts.rdata")
load("D://Rlearn/TCGA/output_mRNA_lncRNA_expr/TCGA-BRCA_clinical.rdata")
dim(clin_info)
[1] 1231   87
head(colnames(clin_info))
[1] "barcode"             "patient"             "sample"             
[4] "shortLetterCode"     "definition"          "sample_submitter_id"

数据整理

  1. 处理表达矩阵:加载的表达矩阵已经是处理好的了
  2. 处理临床信息:加载的临床信息已经是处理好的了
  3. 两个数据的样本量及顺序需要一致
dim(clin_info)
[1] 1231   87
dim(mrna_expr_counts)
[1] 19938  1231

处理生存分析需要用到的临床类型:

1clin_info$days_to_death[is.na(clin_info$days_to_death)] <- 0
clin_info$days_to_last_follow_up[is.na(clin_info$days_to_last_follow_up)] <- 0  
clin_info$days <- ifelse(
  clin_info$vital_status == 'Alive',
  clin_info$days_to_last_follow_up,clin_info$days_to_death
2)
3clin_info$time <- round(clin_info$days/30, 2)

table(clin_info$event)

4clin_info$age_at_index[is.na(clin_info$age_at_index)] <- 0
clin_info$age_at_index <- as.numeric(clin_info$age_at_index)
clin_info$age_group <- ifelse(clin_info$age_at_index>median(clin_info$age_at_index), 'older', 'younger')
table(clin_info$age_group)

5table(clin_info$ajcc_pathologic_stage)
6table(clin_info$race)
7table(clin_info$gender)
1
缺失值标记为 0
2
计算生存时间,如果患者存活,采用days_to_last_follow_up;患者死亡,采用days_to_death
3
时间以月份记,保留两位小数
4
年龄分组(按照中位数)
5
癌症阶段
6
人种
7
性别

以下是上述代码块的运行结果:

clin_info$days_to_death[is.na(clin_info$days_to_death)] <- 0  
clin_info$days_to_last_follow_up[is.na(clin_info$days_to_last_follow_up)] <- 0  
clin_info$days <- ifelse(clin_info$vital_status == 'Alive',clin_info$days_to_last_follow_up,clin_info$days_to_death)

clin_info$time <- round(clin_info$days/30, 2)

clin_info$event <- ifelse(clin_info$vital_status == 'Alive', 0, 1)
table(clin_info$event)

   0    1 
1029  201 
clin_info$age_at_index[is.na(clin_info$age_at_index)] <- 0
clin_info$age_at_index <- as.numeric(clin_info$age_at_index)
clin_info$age_group <- ifelse(clin_info$age_at_index>median(clin_info$age_at_index), 'older', 'younger')
table(clin_info$age_group)

  older younger 
    603     628 
table(clin_info$ajcc_pathologic_stage)

   Stage I   Stage IA   Stage IB   Stage II  Stage IIA  Stage IIB  Stage III 
       106         92          6          6        402        293          2 
Stage IIIA Stage IIIB Stage IIIC   Stage IV    Stage X 
       174         31         71         22         13 
table(clin_info$race)

american indian or alaska native                            asian 
                               1                               62 
       black or african american                     not reported 
                             191                               96 
                           white 
                             880 
table(clin_info$gender)

female   male 
  1217     13 
head(table(clin_info$time))

-0.23     0  0.03  0.17  0.23  0.27 
    1    22     4     3     1     1 

保存数据:

save(mrna_expr_counts, clin_info, file="surv.RData")

临床信息相关的生存分析

library(survival)
Warning: package 'survival' was built under R version 4.3.2
library(survminer)
Warning: package 'survminer' was built under R version 4.3.2
Loading required package: ggplot2
Warning: package 'ggplot2' was built under R version 4.3.2
Loading required package: ggpubr
Warning: package 'ggpubr' was built under R version 4.3.2

Attaching package: 'survminer'
The following object is masked from 'package:survival':

    myeloma
sfit <- survfit(Surv(time, event)~paper_pathologic_stage, data = clin_info)
print(sfit)
Call: survfit(formula = Surv(time, event) ~ paper_pathologic_stage, 
    data = clin_info)

   133 observations deleted due to missingness 
                                   n events median 0.95LCL 0.95UCL
paper_pathologic_stage=NA         24     11   98.8    79.1      NA
paper_pathologic_stage=Stage_I   184     16  132.0   131.5      NA
paper_pathologic_stage=Stage_II  623     68  148.5   122.3      NA
paper_pathologic_stage=Stage_III 248     44     NA    98.8      NA
paper_pathologic_stage=Stage_IV   19     14   37.6    26.4      NA
ggsurvplot(sfit, conf.int=F, pval=TRUE)

生存曲线(survival curve)则是将每个时间点的生存率连接在一起的曲线,一般随访时间为 X 轴,生存率为 Y 轴;曲线平滑则说明高生存率,反之则低生存率;中位生存率(median survival time)越长,则说明预后较好。

  • Surv(time, event)根据生死状态,标记出哪些生存时间数据是删失值,用加号表示区分

  • survfit()用于计算生存曲线,属于 Kaplan-Meier 方法;~paper_pathologic_stage表示以临床分级分组绘图,注意此方法仅适用二分类变量进行分组。如果不想分组,只看总体,~1即可

  • 最后配合 ggsurvplot()函数可视化生存曲线;通过设置参数,可在生存曲线基础上,再绘制两张辅助图

ggsurvplot(
  sfit,
  risk.table =TRUE, pval =TRUE,
  xlab = "Time in days",  
  conf.int =TRUE,
  # 添加中位生存
  surv.median.line = "hv",
  ggtheme =theme_light(), 
)

对年龄和性别作生存分析拼图:

sfit1 <- survfit(Surv(time, event)~gender, data=clin_info)
sfit2 <- survfit(Surv(time, event)~age_group, data=clin_info)
splots <- list()
splots[[1]] <- ggsurvplot(sfit1,pval =TRUE, data = clin_info, risk.table = TRUE)
splots[[2]] <- ggsurvplot(sfit2,pval =TRUE, data = clin_info, risk.table = TRUE)
arrange_ggsurvplots(
  splots, print = TRUE,  
  ncol = 2, nrow = 1, 
  risk.table.height = 0.4
)

基因表达的生存分析

设定某一基因的表达基准,将临床样本分为高表达 high 与低表达 low 两组,再进行生存分析。

exprSet <- mrna_expr_counts
# 选取基因
genename <- "CD24"

# 基因分组
clin_info$Gene <- t(ifelse(
  exprSet[genename,] > median(as.numeric(exprSet[genename,])),
  'High','Low'
))

sfit1 <- survfit(Surv(time, event)~Gene, data = clin_info)
ggsurvplot(sfit1,pval = TRUE, data = clin_info, risk.table = TRUE)

多个基因拼图:

genenames <- c("SLC16A2", "CD74", "VDAC1", "ATG4A")
splots <- lapply(
  genenames, 
  function(genename){
    clin_info$gene = t(ifelse(
      exprSet[genename,] > median(as.numeric(exprSet[genename,])),
      'High','Low'))
    sfit1 = survfit(Surv(time, event)~gene, data = clin_info)
    ggsurvplot(sfit1,pval = TRUE, data = clin_info, risk.table = TRUE)
}) 
arrange_ggsurvplots(splots, print = TRUE,  
                    ncol = 2, nrow = 2, risk.table.height = 0.4)

关于 ggsurvplot()

可以自定义许多选项,函数参数:

ggsurvplot(
  fit,
  data = NULL,
  fun = NULL,
  color = NULL,
  palette = NULL,
  linetype = 1,
  conf.int = FALSE,
  pval = FALSE,
  pval.method = FALSE,
  test.for.trend = FALSE,
  surv.median.line = "none",
  risk.table = FALSE,
  cumevents = FALSE,
  cumcensor = FALSE,
  tables.height = 0.25,
  group.by = NULL,
  facet.by = NULL,
  add.all = FALSE,
  combine = FALSE,
  ggtheme = theme_survminer(),
  tables.theme = ggtheme,
  ...
)

详见 ?ggsurvplot

工作环境

devtools::session_info()
─ Session info ───────────────────────────────────────────────────────────────
 setting  value
 version  R version 4.3.1 (2023-06-16 ucrt)
 os       Windows 11 x64 (build 22621)
 system   x86_64, mingw32
 ui       RTerm
 language (EN)
 collate  Chinese (Simplified)_China.utf8
 ctype    Chinese (Simplified)_China.utf8
 tz       Asia/Hong_Kong
 date     2023-11-07
 pandoc   3.1.9 @ C:/Users/HANWAN~1/AppData/Local/Pandoc/ (via rmarkdown)

─ Packages ───────────────────────────────────────────────────────────────────
 package     * version date (UTC) lib source
 abind         1.4-5   2016-07-21 [1] CRAN (R 4.3.0)
 backports     1.4.1   2021-12-13 [1] CRAN (R 4.3.0)
 broom         1.0.5   2023-06-09 [1] CRAN (R 4.3.2)
 cachem        1.0.8   2023-05-01 [1] CRAN (R 4.3.1)
 callr         3.7.3   2022-11-02 [1] CRAN (R 4.3.1)
 car           3.1-2   2023-03-30 [1] CRAN (R 4.3.2)
 carData       3.0-5   2022-01-06 [1] CRAN (R 4.3.2)
 cli           3.6.1   2023-03-23 [1] CRAN (R 4.3.1)
 colorspace    2.1-0   2023-01-23 [1] CRAN (R 4.3.1)
 commonmark    1.9.0   2023-03-17 [1] CRAN (R 4.3.1)
 crayon        1.5.2   2022-09-29 [1] CRAN (R 4.3.1)
 data.table    1.14.8  2023-02-17 [1] CRAN (R 4.3.2)
 devtools      2.4.5   2022-10-11 [1] CRAN (R 4.3.2)
 digest        0.6.33  2023-07-07 [1] CRAN (R 4.3.1)
 dplyr         1.1.3   2023-09-03 [1] CRAN (R 4.3.2)
 ellipsis      0.3.2   2021-04-29 [1] CRAN (R 4.3.1)
 evaluate      0.21    2023-05-05 [1] CRAN (R 4.3.1)
 fansi         1.0.4   2023-01-22 [1] CRAN (R 4.3.1)
 farver        2.1.1   2022-07-06 [1] CRAN (R 4.3.1)
 fastmap       1.1.1   2023-02-24 [1] CRAN (R 4.3.1)
 fs            1.6.3   2023-07-20 [1] CRAN (R 4.3.1)
 generics      0.1.3   2022-07-05 [1] CRAN (R 4.3.1)
 ggplot2     * 3.4.4   2023-10-12 [1] CRAN (R 4.3.2)
 ggpubr      * 0.6.0   2023-02-10 [1] CRAN (R 4.3.2)
 ggsignif      0.6.4   2022-10-13 [1] CRAN (R 4.3.2)
 ggtext        0.1.2   2022-09-16 [1] CRAN (R 4.3.2)
 glue          1.6.2   2022-02-24 [1] CRAN (R 4.3.1)
 gridExtra     2.3     2017-09-09 [1] CRAN (R 4.3.1)
 gridtext      0.1.5   2022-09-16 [1] CRAN (R 4.3.2)
 gtable        0.3.3   2023-03-21 [1] CRAN (R 4.3.1)
 htmltools     0.5.5   2023-03-23 [1] CRAN (R 4.3.1)
 htmlwidgets   1.6.2   2023-03-17 [1] CRAN (R 4.3.1)
 httpuv        1.6.11  2023-05-11 [1] CRAN (R 4.3.1)
 jsonlite      1.8.7   2023-06-29 [1] CRAN (R 4.3.1)
 km.ci         0.5-6   2022-04-06 [1] CRAN (R 4.3.2)
 KMsurv        0.1-5   2012-12-03 [1] CRAN (R 4.3.1)
 knitr         1.43    2023-05-25 [1] CRAN (R 4.3.1)
 labeling      0.4.2   2020-10-20 [1] CRAN (R 4.3.0)
 later         1.3.1   2023-05-02 [1] CRAN (R 4.3.1)
 lattice       0.21-8  2023-04-05 [2] CRAN (R 4.3.1)
 lifecycle     1.0.3   2022-10-07 [1] CRAN (R 4.3.1)
 magrittr      2.0.3   2022-03-30 [1] CRAN (R 4.3.1)
 markdown      1.11    2023-10-19 [1] CRAN (R 4.3.2)
 Matrix        1.6-1.1 2023-09-18 [1] CRAN (R 4.3.2)
 memoise       2.0.1   2021-11-26 [1] CRAN (R 4.3.1)
 mime          0.12    2021-09-28 [1] CRAN (R 4.3.0)
 miniUI        0.1.1.1 2018-05-18 [1] CRAN (R 4.3.1)
 munsell       0.5.0   2018-06-12 [1] CRAN (R 4.3.1)
 pillar        1.9.0   2023-03-22 [1] CRAN (R 4.3.1)
 pkgbuild      1.4.2   2023-06-26 [1] CRAN (R 4.3.1)
 pkgconfig     2.0.3   2019-09-22 [1] CRAN (R 4.3.1)
 pkgload       1.3.2.1 2023-07-08 [1] CRAN (R 4.3.1)
 prettyunits   1.1.1   2020-01-24 [1] CRAN (R 4.3.1)
 processx      3.8.2   2023-06-30 [1] CRAN (R 4.3.1)
 profvis       0.3.8   2023-05-02 [1] CRAN (R 4.3.1)
 promises      1.2.0.1 2021-02-11 [1] CRAN (R 4.3.1)
 ps            1.7.5   2023-04-18 [1] CRAN (R 4.3.1)
 purrr         1.0.2   2023-08-10 [1] CRAN (R 4.3.2)
 R6            2.5.1   2021-08-19 [1] CRAN (R 4.3.1)
 Rcpp          1.0.11  2023-07-06 [1] CRAN (R 4.3.1)
 remotes       2.4.2.1 2023-07-18 [1] CRAN (R 4.3.1)
 rlang         1.1.1   2023-04-28 [1] CRAN (R 4.3.1)
 rmarkdown     2.23    2023-07-01 [1] CRAN (R 4.3.1)
 rstatix       0.7.2   2023-02-01 [1] CRAN (R 4.3.2)
 rstudioapi    0.15.0  2023-07-07 [1] CRAN (R 4.3.1)
 scales        1.2.1   2022-08-20 [1] CRAN (R 4.3.1)
 sessioninfo   1.2.2   2021-12-06 [1] CRAN (R 4.3.1)
 shiny         1.7.4.1 2023-07-06 [1] CRAN (R 4.3.1)
 stringi       1.7.12  2023-01-11 [1] CRAN (R 4.3.0)
 stringr       1.5.0   2022-12-02 [1] CRAN (R 4.3.1)
 survival    * 3.5-7   2023-08-14 [1] CRAN (R 4.3.2)
 survminer   * 0.4.9   2021-03-09 [1] CRAN (R 4.3.2)
 survMisc      0.5.6   2022-04-07 [1] CRAN (R 4.3.2)
 tibble        3.2.1   2023-03-20 [1] CRAN (R 4.3.1)
 tidyr         1.3.0   2023-01-24 [1] CRAN (R 4.3.1)
 tidyselect    1.2.0   2022-10-10 [1] CRAN (R 4.3.1)
 urlchecker    1.0.1   2021-11-30 [1] CRAN (R 4.3.1)
 usethis       2.2.2   2023-07-06 [1] CRAN (R 4.3.1)
 utf8          1.2.3   2023-01-31 [1] CRAN (R 4.3.1)
 vctrs         0.6.3   2023-06-14 [1] CRAN (R 4.3.1)
 withr         2.5.0   2022-03-03 [1] CRAN (R 4.3.1)
 xfun          0.39    2023-04-20 [1] CRAN (R 4.3.1)
 xml2          1.3.5   2023-07-06 [1] CRAN (R 4.3.1)
 xtable        1.8-4   2019-04-21 [1] CRAN (R 4.3.1)
 yaml          2.3.7   2023-01-23 [1] CRAN (R 4.3.0)
 zoo           1.8-12  2023-04-13 [1] CRAN (R 4.3.1)

 [1] C:/Users/Han Wang/AppData/Local/R/win-library/4.3
 [2] C:/Program Files/R/R-4.3.1/library

──────────────────────────────────────────────────────────────────────────────