# 准备
rm(list=ls())
setwd("D:/R/R-4.0.5/bin/project_survival")
suppressMessages(library(survival))
suppressMessages(library(plyr))
COX回归分析的是多个变量对【时间+事件】的影响
变量:可以是分类变量、等级变量和连续型变量,要把所有变量都转化数字形式
【时间+事件】:即Coxsurv
load("Base_Lung.Rdata")
str(Base_Lung)
## 'data.frame': 167 obs. of 11 variables:
## $ inst : Factor w/ 17 levels "1","2","3","4",..: 3 5 10 7 9 1 7 6 10 15 ...
## $ time : num 455 210 1022 310 361 ...
## $ status : Factor w/ 2 levels "censored","dead": 2 2 1 2 2 2 2 2 2 2 ...
## $ age : Factor w/ 2 levels ">65","<=65": 1 2 1 1 1 2 2 2 2 1 ...
## $ sex : Factor w/ 2 levels "Male","Female": 1 1 1 2 2 1 1 1 1 1 ...
## $ ph.ecog : Factor w/ 5 levels "asymptomatic",..: 1 2 2 3 3 2 3 2 2 2 ...
## $ ph.karno : num 90 90 50 70 60 70 70 80 80 90 ...
## $ pat.karno: num 90 60 80 60 80 80 70 80 70 100 ...
## $ meal.cal : num 1225 1150 513 384 538 ...
## $ wt.loss : Factor w/ 2 levels "Weight loss",..: 1 1 2 1 1 1 1 1 1 2 ...
## $ treatment: Factor w/ 2 levels "low","high": 2 2 1 1 1 1 1 1 1 2 ...
## - attr(*, "na.action")= 'omit' Named int [1:61] 1 3 5 12 13 14 16 20 23 25 ...
## ..- attr(*, "names")= chr [1:61] "1" "3" "5" "12" ...
CoxLu <- Base_Lung
CoxLu[,c(1,3,4,5,6,7,10,11)] <- lapply(CoxLu[,c(1,3,4,5,6,7,10,11)],as.numeric)
str(CoxLu)
## 'data.frame': 167 obs. of 11 variables:
## $ inst : num 3 5 10 7 9 1 7 6 10 15 ...
## $ time : num 455 210 1022 310 361 ...
## $ status : num 2 2 1 2 2 2 2 2 2 2 ...
## $ age : num 1 2 1 1 1 2 2 2 2 1 ...
## $ sex : num 1 1 1 2 2 1 1 1 1 1 ...
## $ ph.ecog : num 1 2 2 3 3 2 3 2 2 2 ...
## $ ph.karno : num 90 90 50 70 60 70 70 80 80 90 ...
## $ pat.karno: num 90 60 80 60 80 80 70 80 70 100 ...
## $ meal.cal : num 1225 1150 513 384 538 ...
## $ wt.loss : num 1 1 2 1 1 1 1 1 1 2 ...
## $ treatment: num 2 2 1 1 1 1 1 1 1 2 ...
## - attr(*, "na.action")= 'omit' Named int [1:61] 1 3 5 12 13 14 16 20 23 25 ...
## ..- attr(*, "names")= chr [1:61] "1" "3" "5" "12" ...
lapply()函数的应用:help | 说明
lapply(X,FUN): returns a list of the same length as X, each element of which is the result of applying FUN to the corresponding element of X.
sapply(X,FUN,simplify = T/F): 默认为T,返回向量或矩阵;F时相当于lapply,返回list
For lapply, sapply(simplify = FALSE) and replicate(simplify = FALSE), a list
解释
lapply() 涉及2个参数,X可以为向量、列表、表达式,总之会被强制转为list;FUN:自定义函数或直接用
lapply\sapply\vapply的区别就是输出值的不同,sapply(x,f,simplify=T)默认输出矩阵或向量
此处的难点在于coxph()的结果太复杂难以提取,这个对象的数据提取方法需要重点记忆
另外,注意提取数据装到向量里面,把向量包装成数据框的思路
#提取生存信息
Coxsurv <- Surv(CoxLu$time,CoxLu$status)#即【时间+事件】
Coxsurv
## [1] 455 210 1022+ 310 361 218 166 170 567 613 707 61
## [13] 301 81 371 520 574 118 390 12 473 26 107 53
## [25] 814 965+ 93 731 460 153 433 583 95 303 519 643
## [37] 765 53 246 689 5 687 345 444 223 60 163 65
## [49] 821+ 428 230 840+ 305 11 226 426 705 363 176 791
## [61] 95 196+ 167 806+ 284 641 147 740+ 163 655 88 245
## [73] 30 477 559+ 450 156 529+ 429 351 15 181 283 13
## [85] 212 524 288 363 199 550 54 558 207 92 60 551+
## [97] 293 353 267 511+ 457 337 201 404+ 222 62 458+ 353
## [109] 163 31 229 156 291 179 376+ 384+ 268 292+ 142 413+
## [121] 266+ 320 181 285 301+ 348 197 382+ 303+ 296+ 180 145
## [133] 269+ 300+ 284+ 292+ 332+ 285 259+ 110 286 270 225+ 269
## [145] 225+ 243+ 276+ 135 79 59 240+ 202+ 235+ 239 252+ 221+
## [157] 185+ 222+ 183 211+ 175+ 197+ 203+ 191+ 105+ 174+ 177+
CoxLu$Coxsurv <- with(CoxLu,Coxsurv)
#COX回归分析
Acox <- coxph(Coxsurv~age,data=CoxLu)#拟合回归模型
#提取HR、P、95%CI
Asum <- summary(Acox)
Asum$coefficients
## coef exp(coef) se(coef) z Pr(>|z|)
## age -0.3516327 0.7035385 0.1854763 -1.895836 0.0579817
Asum$conf.int
## exp(coef) exp(-coef) lower .95 upper .95
## age 0.7035385 1.421386 0.489114 1.011965
#把结果封装到向量里面
HR <- round(Asum$coefficients[,2],2)
P.Value <- round(Asum$coefficients[,5],3)
CI <- paste0(round(Asum$conf.int[,c(3,4)],2),collapse = "-")
#把向量整合进数据框
Unicox <- data.frame('Characteristic' = 'Age',
'Hazard Radio' = HR,
'95% CI' = CI,
'P.Value' = P.Value)
生存信息:所有生存分析所需要的最基本的数据原材料,是时间和事件的结合
拟合COX回归模型:coxph(formula,data)
Fit Proportional Hazards Regression Model:拟合比例风险模型
输入:formula=生存信息~变量;data=数据框
结果是一个我看不懂的列表,所以要学会提取结果
结果提取
首先summary一下,summary完了还是列表,但是里面的槽简单了很多
记住两个槽:系数:coefficients;可信区间:comf.int
4.一个小知识点:R里面变量名最好以字母开头,以数字开头,如95CI可能会出现问题
lapply()进行批量操作步骤: 获得X,定义FUN
#获得X
colnames(CoxLu)
## [1] "inst" "time" "status" "age" "sex" "ph.ecog"
## [7] "ph.karno" "pat.karno" "meal.cal" "wt.loss" "treatment" "Coxsurv"
cox_var <- colnames(CoxLu)[c(1,4:11)]
#定义函数
UNICOX <- function(x){
fml <- as.formula(paste0("Coxsurv~",x))
Ucox <- coxph(fml,data=CoxLu)
Usum <- summary(Ucox)
HR <- round(Usum$coefficients[,2],2)
P.Value <- round(Usum$coefficients[,5],3)
CI <- paste0(round(Usum$conf.int[,c(3,4)],2),collapse = "-")
unicox <- data.frame('Characteristic' = x,
'Hazard Ratio' = HR,
'CI95' = CI,
'P.Value' = P.Value)
return(unicox)
}
Unicox_Lu <- lapply(cox_var,UNICOX)
Unicox_Lu <- ldply(Unicox_Lu,data.frame)#转换为数据框
Unicox_Lu
## Characteristic Hazard.Ratio CI95 P.Value
## 1 inst 0.98 0.94-1.01 0.187
## 2 age 0.70 0.49-1.01 0.058
## 3 sex 0.62 0.42-0.91 0.015
## 4 ph.ecog 1.60 1.23-2.08 0.000
## 5 ph.karno 0.99 0.97-1 0.069
## 6 pat.karno 0.98 0.97-0.99 0.002
## 7 meal.cal 1.00 1-1 0.621
## 8 wt.loss 0.97 0.65-1.46 0.885
## 9 treatment 0.72 0.5-1.03 0.071
ldply() 函数:需要输入2个参数:data 需要处理的列表;fun 需要对列表中的元素进行的操作
输出:把fun 的结果进行rbind 操作形成数据框输出
fun fact: list input: l_ply(); laply(); llply() ; data.frame output: adply(), ddply(), mdply()
输入值的选择:此处的函数的输入值是colnames(字符串),然后需要在函数里面把它翻译成各种可以被应用的形式,paste() 函数是非常好的帮手
输入值是字符串,函数里面的变量是formula里面 ~后面的部分,怎么办——paste()函数
multi_var <- Unicox_Lu[Unicox_Lu$P.Value < 0.05,]$Characteristic
MULTICOX <- function(x){
MVAR <- paste(x,collapse = "+")
fml <- as.formula(paste0("Coxsurv~",MVAR))
Multicox <- coxph(fml,data=CoxLu)
Msum <- summary(Multicox)
HR <- round(Msum$coefficients[,2],2)
CI <- round(Msum$conf.int[,c(3,4)],2)
CI95 <- apply(CI,1,function(ci)paste0(ci,collapse = "-"))
P.Value <- round(Msum$coefficients[,5],3)
multicox <- data.frame("Characteristic" = x,
"Hazard Ratio" = HR,
"CI95" = CI95,
"P.value" = P.Value)
return(multicox)
}
Multicox_Lu <- MULTICOX(multi_var)
paste0(V1,V2,collapse = "-") 就行了
但是上面的代码里面,提取出来的是一个2列的数据框,直接用paste0 会全部粘在一起
最简单的方法是拆成2个向量再粘
final <- merge.data.frame(Unicox_Lu,Multicox_Lu,by = "Characteristic",
sort = T,all = T)
final
## Characteristic Hazard.Ratio.x CI95.x P.Value Hazard.Ratio.y CI95.y
## 1 age 0.70 0.49-1.01 0.058 NA <NA>
## 2 inst 0.98 0.94-1.01 0.187 NA <NA>
## 3 meal.cal 1.00 1-1 0.621 NA <NA>
## 4 pat.karno 0.98 0.97-0.99 0.002 0.99 0.98-1.01
## 5 ph.ecog 1.60 1.23-2.08 0.000 1.48 1.07-2.04
## 6 ph.karno 0.99 0.97-1 0.069 NA <NA>
## 7 sex 0.62 0.42-0.91 0.015 0.61 0.41-0.9
## 8 treatment 0.72 0.5-1.03 0.071 NA <NA>
## 9 wt.loss 0.97 0.65-1.46 0.885 NA <NA>
## P.value
## 1 NA
## 2 NA
## 3 NA
## 4 0.344
## 5 0.018
## 6 NA
## 7 0.012
## 8 NA
## 9 NA
write.csv(final,"final_Lu_cox.csv")
paste0(1:12,c("a","b","c"))
## [1] "1a" "2b" "3c" "4a" "5b" "6c" "7a" "8b" "9c" "10a" "11b" "12c"
paste0(1:12,letters[1:7])
## [1] "1a" "2b" "3c" "4d" "5e" "6f" "7g" "8a" "9b" "10c" "11d" "12e"
paste0(1:12,letters[1:12],collapse = "-")
## [1] "1a-2b-3c-4d-5e-6f-7g-8h-9i-10j-11k-12l"
paste0(1:12,letters[1:12],sep = "-")
## [1] "1a-" "2b-" "3c-" "4d-" "5e-" "6f-" "7g-" "8h-" "9i-" "10j-"
## [11] "11k-" "12l-"
paste(1:12,letters[1:12],sep = "-")
## [1] "1-a" "2-b" "3-c" "4-d" "5-e" "6-f" "7-g" "8-h" "9-i" "10-j"
## [11] "11-k" "12-l"
paste(1:12,letters[1:12],collapse = "-")
## [1] "1 a-2 b-3 c-4 d-5 e-6 f-7 g-8 h-9 i-10 j-11 k-12 l"
paste0(1:12,letters[1:12])
## [1] "1a" "2b" "3c" "4d" "5e" "6f" "7g" "8h" "9i" "10j" "11k" "12l"
paste(1:12,letters[1:12])
## [1] "1 a" "2 b" "3 c" "4 d" "5 e" "6 f" "7 g" "8 h" "9 i" "10 j"
## [11] "11 k" "12 l"
paste() :输入:
… 任何东西基本上都可以粘,长短相同就一一对应,参差不齐就自动循环
collapse= 得到结果是一个长字符串,粘完后用符号连接
sep= 粘完后不连,主要配合paste使用,表示用什么粘,paste0 直接粘
输出:
默认粘完不连,输出字符串向量,paste中间有空格(默认sep = " "),paste0中间没有空格
加上collapse 参数,输出1个长字符串