## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.6     ✓ dplyr   1.0.8
## ✓ tidyr   1.2.0     ✓ stringr 1.4.0
## ✓ readr   2.1.2     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
## Loading required package: ggpubr
## 
## Attaching package: 'survminer'
## The following object is masked from 'package:survival':
## 
##     myeloma
## gdata: read.xls support for 'XLS' (Excel 97-2004) files ENABLED.
## 
## gdata: read.xls support for 'XLSX' (Excel 2007+) files ENABLED.
## 
## Attaching package: 'gdata'
## The following objects are masked from 'package:dplyr':
## 
##     combine, first, last
## The following object is masked from 'package:purrr':
## 
##     keep
## The following object is masked from 'package:stats':
## 
##     nobs
## The following object is masked from 'package:utils':
## 
##     object.size
## The following object is masked from 'package:base':
## 
##     startsWith

Rによる生存時間解析 下記サイトを元に勉強

https://zenn.dev/masahiro_kondo/articles/a71d9cbba294fa

#変数の作成
th1 <- prostate %>% 
  mutate(rx_c = case_when(
    rx == "placebo" ~ 0,
    rx == "0.2 mg estrogen" ~ 1,
    rx == "1.0 mg estrogen" ~ 2,
    rx == "5.0 mg estrogen" ~ 3
  )) %>% 
  mutate(rx_c2 = case_when(
    rx == "placebo" ~ 0,
    rx == "0.2 mg estrogen" ~ 1,
    rx == "1.0 mg estrogen" ~ 1,
    rx == "5.0 mg estrogen" ~ 1
  )) %>% 
  mutate(status2 = case_when(
    status == "alive" ~ 0,
    status == "dead - prostatic ca" ~ 1,
    status %in% c("dead - heart or vascular","dead - cerebrovascular") ~ 2,
    TRUE ~ 3
  )) %>% 
  mutate(status3 =
           ifelse(status2==0,0,
                  ifelse(status2==2,1,0))) %>% 
  mutate(status4 = 
           ifelse(status2==0,0,
                  ifelse(status2==3,1,0))) %>% 
  mutate(status5 = 
           ifelse(status2==1,1,0)) %>% 
  mutate(stage_c=as.factor(stage)) %>% 
  mutate(dtime_y=dtime/12)
th2 <- th1 %>% dplyr::select(rx_c,age,stage_c,status,dtime_y,status5) 
  
table1 <- tbl_summary(
  th2,
  by = rx_c,
  missing = "ifany",
  type = list(c(age, dtime_y) ~ "continuous")) %>% 
  add_n() %>% 
  modify_header(label = "**Variable**")
table1
Variable N 0, N = 1271 1, N = 1241 2, N = 1261 3, N = 1251
age 501 73 (70, 76) 73 (70, 75) 73 (69, 76) 73 (69, 76)
Unknown 0 0 1 0
stage_c 502
3 74 (58%) 73 (59%) 71 (56%) 71 (57%)
4 53 (42%) 51 (41%) 55 (44%) 54 (43%)
status 502
alive 32 (25%) 29 (23%) 55 (44%) 32 (26%)
dead - cerebrovascular 7 (5.5%) 6 (4.8%) 11 (8.7%) 7 (5.6%)
dead - heart or vascular 27 (21%) 19 (15%) 14 (11%) 36 (29%)
dead - other ca 3 (2.4%) 9 (7.3%) 9 (7.1%) 4 (3.2%)
dead - other specific non-ca 10 (7.9%) 9 (7.3%) 4 (3.2%) 5 (4.0%)
dead - prostatic ca 37 (29%) 42 (34%) 24 (19%) 27 (22%)
dead - pulmonary embolus 2 (1.6%) 2 (1.6%) 3 (2.4%) 7 (5.6%)
dead - respiratory disease 6 (4.7%) 4 (3.2%) 4 (3.2%) 2 (1.6%)
dead - unknown cause 1 (0.8%) 3 (2.4%) 1 (0.8%) 2 (1.6%)
dead - unspecified non-ca 2 (1.6%) 1 (0.8%) 1 (0.8%) 3 (2.4%)
dtime_y 502 2.75 (1.33, 4.83) 2.50 (1.15, 4.77) 3.46 (1.33, 4.92) 2.92 (1.00, 4.50)
status5 502 37 (29%) 42 (34%) 24 (19%) 27 (22%)
1 Median (IQR); n (%)

プラセボ群と、介入群での比較

カプランマイヤー

#KM曲線
fit <- survfit( Surv(dtime_y, status5) ~ rx_c2, data = th1 )
ggsurvplot(
  fit,                     
  data = th1,             
  risk.table = TRUE,       
  pval = TRUE,             
  conf.int = TRUE,         
  xlab = "YEAR", 
  ggtheme = theme_light(), 
  risk.table.y.text.col = T, 
  risk.table.y.text = FALSE, 
  legend.labs =
   c("Placebo", "Estrogen")
)

生存曲線のクロスをどう見るべき?

生存曲線がクロスする理由 1)奏効する部分集団と奏効しない部分集団が混在 2)途中脱落が多く,後半の差は精度不足で偶然で生じた.(時間軸の限定が必要) 3)OSで治療のクロスオーバーが大量に起きた. 4)mild(延命)な治療とintensive(治癒)な治療の比較.

引用 https://www.croit.com/wp_croit2/wp-content/uploads/2018/11/16_01.pdf

1)の例がわかりやすい 1の例

2)は、リスクテーブルから判断すべきか 3はよく分からない 4はそもそも比較すべきでないってことか

介入のクラス毎のカプランマイヤー

fit2 <- survfit( Surv(dtime_y, status5) ~ rx_c, data = th1 )

ggsurvplot(
  fit2,                     
  data = th1,             
  risk.table = TRUE,       
  pval = TRUE,             
  conf.int = FALSE,         
  xlab = "YEAR", 
  ggtheme = theme_light(), 
  risk.table.y.text.col = T, 
  risk.table.y.text = FALSE, 
  legend.labs =
    c("Placebo", "0.2 mg estrogen", "1.0 mg estrogen", "5.0 mg estrogen")
)

## 生存時間分析

survfit(Surv(dtime_y, status5) ~ rx_c2, data = th1)
## Call: survfit(formula = Surv(dtime_y, status5) ~ rx_c2, data = th1)
## 
##           n events median 0.95LCL 0.95UCL
## rx_c2=0 127     37   5.92    5.75      NA
## rx_c2=1 375     93     NA      NA      NA

何で信頼区間とメジアンが並んでいるのだろう? LCLとかUCLと書いてるが、信頼区間ではないと思う。(それとも、これも信頼区間と言うのか?) 0.95分位点じゃないの?? →違った。中央値の信頼区間(U検定を実施してると思う)

quantile(fit, c(1-0.95,1-0.5,1-0.05))
## $quantile
##                 5       50 95
## rx_c2=0 0.5833333 5.916667 NA
## rx_c2=1 0.7500000       NA NA
## 
## $lower
##                 5   50 95
## rx_c2=0 0.1666667 5.75 NA
## rx_c2=1 0.5000000   NA NA
## 
## $upper
##           5 50 95
## rx_c2=0 1.5 NA NA
## rx_c2=1 1.0 NA NA

5年生存の割合

どんな値でも信頼区間って出せるんだね、、、

#Survival probability at a certain time
summary(fit,times = 5)
## Call: survfit(formula = Surv(dtime_y, status5) ~ rx_c2, data = th1)
## 
##                 rx_c2=0 
##         time       n.risk      n.event     survival      std.err lower 95% CI 
##        5.000       28.000       35.000        0.626        0.052        0.532 
## upper 95% CI 
##        0.737 
## 
##                 rx_c2=1 
##         time       n.risk      n.event     survival      std.err lower 95% CI 
##       5.0000      81.0000      89.0000       0.6638       0.0311       0.6055 
## upper 95% CI 
##       0.7278

ログランク検定