Recap
Example of survival analysis
Obtain summary statistics and produce useful graphs (continued)
More material at https://alecri.github.io/courses/getStartedR_uu.html
February 28, 2018
Recap
Example of survival analysis
Obtain summary statistics and produce useful graphs (continued)
More material at https://alecri.github.io/courses/getStartedR_uu.html
Install R and Rstudio
Install and load packages
install.packages("tidyverse")
library(tidyverse)
orca = read.table("http://www.stats4life.se/data/oralca.txt")
5 verbs for data manipulation (mutate(), select(), filter(), arrange(), summarise()) and combining them with the %>% (pipe) operator
Start producing summary statistics and useful graphs
library(plotly)
ggplotly(
ggplot(orca, aes(x = id, y = time)) + geom_linerange(aes(ymin = 0, ymax = time)) +
geom_point(aes(shape = event, color = event)) + scale_shape_manual(values = c(1, 3, 4)) +
labs(y = "Time (years)", x = "Subject ID") + coord_flip() + theme_classic()
)
The Surv() function creates a survival object, i.e. the time variable and the failure or status indicator
library(survival) orca <- mutate(orca, dead = (event != "Alive")) su_obj = Surv(orca$time, orca$dead) head(su_obj, n = 20) [1] 5.081+ 0.419 7.915 2.480 2.500 0.167 5.925+ 1.503 13.333 7.666+ 2.834+ 0.914 3.083 4.668 [15] 4.816 2.166 5.837 0.162 3.666 0.504
The created survival object is then used as response variable in specific functions for survival analysis.
The survfit() function by the default estimate then survival curve using the Kaplan–Meier estimator
fit_km = survfit(su_obj ~ 1, data = orca)
print(fit_km, print.rmean = TRUE)
Call: survfit(formula = su_obj ~ 1, data = orca)
n events *rmean *se(rmean) median 0.95LCL 0.95UCL
338.000 229.000 8.060 0.465 5.418 4.331 6.916
* restricted mean with upper limit = 23.3
The fortify() function of the ggfortify package is useful to extract the whole survival table in a data.frame.
library(ggfortify) fortify(fit_km) %>% head() time n.risk n.event n.censor surv std.err upper lower 1 0.085 338 2 0 0.9940828 0.004196498 1.0000000 0.9859401 2 0.162 336 2 0 0.9881657 0.005952486 0.9997618 0.9767041 3 0.167 334 4 0 0.9763314 0.008468952 0.9926726 0.9602592 4 0.170 330 2 0 0.9704142 0.009497400 0.9886472 0.9525175 5 0.246 328 1 0 0.9674556 0.009976176 0.9865584 0.9487228 6 0.249 327 1 0 0.9644970 0.010435745 0.9844277 0.9449699
Change the argument fun = events for the cumulative proportion, cumhaz for the cumulative hazard, cloglog for the complementary log−log
library(survminer)
ggsurvplot(fit_km, risk.table = TRUE, fun = "pct", censor = T,
xlab = "Time (years)", palette = "black", legend = "none")
mc = data.frame(q = c(.25, .5, .75), km = quantile(fit_km))
mc
q km.quantile km.lower km.upper
25 0.25 1.333 1.084 1.834
50 0.50 5.418 4.331 6.916
75 0.75 13.673 11.748 16.580
ggsurvplot(fit_km, xlab = "Time (years)", censor = F, legend = "none",
surv.median.line = "hv")
Check ?flexsurvreg for a description of the different distributions
library(flexsurv)
fit_wei = flexsurvreg(su_obj ~ 1, data = orca, dist = "weibull")
fit_wei
Call:
flexsurvreg(formula = su_obj ~ 1, data = orca, dist = "weibull")
Estimates:
est L95% U95% se
shape 0.821 0.737 0.914 0.045
scale 8.426 7.193 9.871 0.680
N = 338, Events: 229, Censored: 109
Total time at risk: 1913.673
Log-likelihood = -708.0542, df = 2
AIC = 1420.108
data.frame(summary(fit_wei)) %>% head() time est lcl ucl 1 0.085 0.9772491 0.9661026 0.9850261 2 0.162 0.9616852 0.9462490 0.9734671 3 0.167 0.9607367 0.9450804 0.9727476 4 0.170 0.9601706 0.9443845 0.9723172 5 0.246 0.9464460 0.9275947 0.9617109 6 0.249 0.9459254 0.9269537 0.9613017
library(gridExtra)
grid.arrange(
ggplot(data.frame(summary(fit_wei)), aes(time, est)) +
geom_line() + labs(x = "Time (years)", y = "Survival"),
ggplot(data.frame(summary(fit_wei, type = "hazard")), aes(time, est)) +
geom_line() + labs(x = "Time (years)", y = "Hazard"), ncol = 2
)
su_stg = survfit(su_obj ~ stage, data = orca)
su_stg
Call: survfit(formula = su_obj ~ stage, data = orca)
n events median 0.95LCL 0.95UCL
stage=I 50 25 10.56 6.17 NA
stage=II 77 51 7.92 4.92 13.34
stage=III 72 51 7.41 3.92 9.90
stage=IV 68 57 2.00 1.08 4.82
stage=unkn 71 45 3.67 2.83 8.17
survdiff(su_obj ~ stage, data = orca)
Call:
survdiff(formula = su_obj ~ stage, data = orca)
N Observed Expected (O-E)^2/E (O-E)^2/V
stage=I 50 25 39.9 5.573 6.813
stage=II 77 51 63.9 2.606 3.662
stage=III 72 51 54.1 0.174 0.231
stage=IV 68 57 33.2 16.966 20.103
stage=unkn 71 45 37.9 1.346 1.642
Chisq= 27.2 on 4 degrees of freedom, p= 1.78e-05
Change the argument fun = events to compare cumulative proportions, cumhaz for the cumulative hazards, cloglog for the complementary log−log
ggsurvplot(su_stg, fun = "event", censor = F, xlab = "Time (years)")
library(Epi)
m1 = coxph(su_obj ~ sex + I((age-65)/10) + stage, data = orca)
ci.exp(m1)
exp(Est.) 2.5% 97.5%
sexMale 1.421036 1.0770928 1.874809
I((age - 65)/10) 1.515929 1.3572535 1.693156
stageII 1.035537 0.6385626 1.679298
stageIII 1.412619 0.8727664 2.286398
stageIV 2.423992 1.5063276 3.900703
stageunkn 1.793932 1.0963386 2.935398
# check proportionallity of the hazard
cox.zph(m1)
rho chisq p
sexMale -0.00137 0.000439 0.983
I((age - 65)/10) 0.07539 1.393597 0.238
stageII -0.04208 0.411652 0.521
stageIII -0.06915 1.083755 0.298
stageIV -0.10044 2.301780 0.129
stageunkn -0.09663 2.082042 0.149
GLOBAL NA 4.895492 0.557
newd = data.frame(sex = "Male", age = 40, stage = "IV") fortify(survfit(m1, newdata = newd)) %>% ggplot(aes(x = time, y = surv)) + geom_step() + geom_ribbon(aes(ymin = lower, ymax = upper), alpha = .2) + labs(x = "Time (years)", y = "Survival probability") + theme_classic()
m2 = flexsurvreg(su_obj ~ sex + I((age-65)/10) + stage, data = orca, dist = "weibull")
m2
Call:
flexsurvreg(formula = su_obj ~ sex + I((age - 65)/10) + stage,
data = orca, dist = "weibull")
Estimates:
data mean est L95% U95% se exp(est) L95% U95%
shape NA 0.9181 0.8272 1.0191 0.0489 NA NA NA
scale NA 14.5312 9.1847 22.9898 3.4012 NA NA NA
sexMale 0.5503 -0.4276 -0.7283 -0.1269 0.1534 0.6521 0.4827 0.8808
I((age - 65)/10) -0.1486 -0.4613 -0.5817 -0.3409 0.0614 0.6305 0.5590 0.7111
stageII 0.2278 -0.0144 -0.5397 0.5109 0.2680 0.9857 0.5829 1.6668
stageIII 0.2130 -0.3538 -0.8783 0.1706 0.2676 0.7020 0.4155 1.1860
stageIV 0.2012 -0.9674 -1.4889 -0.4458 0.2661 0.3801 0.2256 0.6403
stageunkn 0.2101 -0.6513 -1.1863 -0.1163 0.2730 0.5214 0.3053 0.8902
N = 338, Events: 229, Censored: 109
Total time at risk: 1913.673
Log-likelihood = -662.2349, df = 8
AIC = 1340.47
newd = data_frame(sex = "Male", age = 40, stage = "IV") data.frame(unname(summary(m2, newdata = newd))) %>% ggplot(aes(x = time, y = est)) + geom_step() + geom_ribbon(aes(ymin = lcl, ymax = ucl), alpha = .2) + labs(x = "Time (years)", y = "Survival probability") + theme_classic()
# first 6 observations head(orca$age) [1] 65.42274 83.08783 52.59008 77.08630 80.33622 82.58132
# summary statistics (range, quartiles, and mean) summary(orca$age) Min. 1st Qu. Median Mean 3rd Qu. Max. 15.15 53.24 64.86 63.51 74.29 92.24
# save sample size, mean, median, and standard deviation in a new object
stat_age = c(n = length(orca$age), mean = mean(orca$age),
median = median(orca$age), sd = sd(orca$age))
stat_age
n mean median sd
338.00000 63.51398 64.85735 14.43288
Most common graphical presentation: histogram (?geom_histogram) and …
ggplot(orca, aes(age)) + geom_histogram()
ggplot(orca, aes(x = time, y = ..density..)) + geom_histogram(bins = 25) + geom_line(stat = "density")
… box plot (?geom_boxplot)
ggplot(orca, aes(x = 1, y = age)) + geom_boxplot() + labs(x = "", y = "Age (yr)") + theme_classic()
Table of frequencies are the typical way to summarize a categorical variable
# summary(orca$event)
tab = table(orca$event)
tab
Alive Oral ca. death Other death
109 122 107
# for proportions
round(prop.table(tab), 2)
Alive Oral ca. death Other death
0.32 0.36 0.32
Most common graphical presentation: bar plot (?geom_bar)
ggplot(orca, aes(event)) + geom_bar()
library(scales) ggplot(orca, aes(event)) + geom_bar(aes(y = ..count../sum(..count..))) + labs(x = "", y = "Frequency") + scale_y_continuous(labels = percent)
table(orca$stage) I II III IV unkn 50 77 72 68 71
orca %>% group_by(stage) %>% summarize(mean = mean(age), se = sd(age)) # A tibble: 5 x 3 stage mean se <fct> <dbl> <dbl> 1 I 62.9 14.9 2 II 64.6 14.6 3 III 61.7 13.6 4 IV 64.2 13.8 5 unkn 63.9 15.5
Graphical presentation: box plot
ggplot(orca, aes(x = stage, y = age)) + geom_boxplot() + coord_flip()
mod = lm(age ~ stage, data = orca)
summary(mod)
Call:
lm(formula = age ~ stage, data = orca)
Residuals:
Min 1Q Median 3Q Max
-47.73 -10.22 1.13 10.45 28.30
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 62.881 2.047 30.712 <2e-16 ***
stageII 1.749 2.630 0.665 0.507
stageIII -1.189 2.665 -0.446 0.656
stageIV 1.324 2.697 0.491 0.624
stageunkn 1.053 2.673 0.394 0.694
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 14.48 on 333 degrees of freedom
Multiple R-squared: 0.005698, Adjusted R-squared: -0.006245
F-statistic: 0.4771 on 4 and 333 DF, p-value: 0.7526
Graphically compare the distributions
ggplot(orca, aes(x = age, col = stage)) + geom_line(stat = "density")
ggplot(orca, aes(x = age, group = stage)) + geom_line(stat = "density") + facet_grid(stage ~ .)
# create binary variable 'dead' (0 alive 1 dead)
orca = mutate(orca, dead = factor(event != "Alive", labels = c("Alive", "Dead")))
tab2 = with(orca, table(sex, dead))
addmargins(tab2)
dead
sex Alive Dead Sum
Female 50 102 152
Male 59 127 186
Sum 109 229 338
# get proportion (margin: 1 by row, 2 by col)
100*prop.table(tab2, margin = 2)
dead
sex Alive Dead
Female 45.87156 44.54148
Male 54.12844 55.45852
# Test the association
chisq.test(tab2)
Pearson's Chi-squared test with Yates' continuity correction
data: tab2
X-squared = 0.012725, df = 1, p-value = 0.9102
Check out the Epi package
library(Epi)
stat.table(list(sex, dead),
list(count(), percent(sex)),
orca, margin = T)
---------------------------------
----------dead-----------
sex Alive Dead Total
---------------------------------
Female 50 102 152
45.9 44.5 45.0
Male 59 127 186
54.1 55.5 55.0
Total 109 229 338
100.0 100.0 100.0
---------------------------------
with(orca, twoby2(sex, relevel(dead, 2)))
2 by 2 table analysis:
------------------------------------------------------
Outcome : Dead
Comparing : Female vs. Male
Dead Alive P(Dead) 95% conf. interval
Female 102 50 0.6711 0.5926 0.7410
Male 127 59 0.6828 0.6125 0.7456
95% conf. interval
Relative Risk: 0.9828 0.8474 1.1399
Sample Odds Ratio: 0.9477 0.5994 1.4984
Conditional MLE Odds Ratio: 0.9479 0.5845 1.5396
Probability difference: -0.0117 -0.1118 0.0870
Exact P-value: 0.9069
Asymptotic P-value: 0.8183
------------------------------------------------------
Or alternatively with the epitools package
library(epitools) with(orca, epitab(table(dead, sex), verbose = T))
Graphical presentation: bar plot
ggplot(orca, aes(x = dead, group = sex)) +
geom_bar(aes(y = ..prop.., fill = sex),
position = "dodge") +
scale_y_continuous(labels = percent)
ggplot(orca, aes(x = dead, group = sex)) + geom_bar(aes(y = ..prop..), position = "dodge") + facet_grid(~ sex) + scale_y_continuous(labels = percent)
If time to event was not of interest (not this case)
mod_bin = glm(dead ~ relevel(sex, 2), data = orca, family = "binomial")
# presenting odds ratio
ci.exp(mod_bin)
exp(Est.) 2.5% 97.5%
(Intercept) 2.1525424 1.580672 2.931309
relevel(sex, 2)Female 0.9477165 0.599421 1.498390
# predicted probabilities orca %>% mutate(prob = predict(mod_bin, type = "response")) %>% head() id sex age stage time event dead prob 1 1 Male 65.42274 unkn 5.081 Alive Alive 0.6827957 2 2 Female 83.08783 III 0.419 Oral ca. death Dead 0.6710526 3 3 Male 52.59008 II 7.915 Other death Dead 0.6827957 4 4 Male 77.08630 I 2.480 Other death Dead 0.6827957 5 5 Male 80.33622 IV 2.500 Oral ca. death Dead 0.6827957 6 6 Female 82.58132 IV 0.167 Other death Dead 0.6710526
orca %>% group_by(sex, dead) %>% summarise(mean = mean(age), sd = sd(age)) # A tibble: 4 x 4 # Groups: sex [?] sex dead mean sd <fct> <fct> <dbl> <dbl> 1 Female Alive 61.1 14.2 2 Female Dead 71.2 12.7 3 Male Alive 51.9 14.1 4 Male Dead 63.6 12.0
stat.table(index = list(sex, dead),
contents = list(mean(age), sd(age)),
data = orca, margins = T)
---------------------------------
----------dead-----------
sex Alive Dead Total
---------------------------------
Female 61.12 71.21 67.89
14.23 12.75 14.04
Male 51.94 63.65 59.94
14.07 12.00 13.78
Total 56.15 67.02 63.51
14.81 12.88 14.43
---------------------------------
ggplot(orca, aes(x = age, col = dead)) + geom_line(stat = "density") + facet_grid(~ sex)