Get started with R (Part 2)

February 28, 2018

Outline

Recap
Example of survival analysis
Obtain summary statistics and produce useful graphs (continued)

More material at https://alecri.github.io/courses/getStartedR_uu.html

Recap

I föregående avsnitt

Install R and Rstudio
Install and load packages

install.packages("tidyverse")
library(tidyverse)

Motivating example

orca = read.table("http://www.stats4life.se/data/oralca.txt")

5 verbs for data manipulation (mutate(), select(), filter(), arrange(), summarise()) and combining them with the %>% (pipe) operator
Start producing summary statistics and useful graphs

Example of survival analysis (more material can be found here

Representing survival data

library(plotly)
ggplotly(
  ggplot(orca, aes(x = id, y = time)) + geom_linerange(aes(ymin = 0, ymax = time)) +
    geom_point(aes(shape = event, color = event)) + scale_shape_manual(values = c(1, 3, 4)) +
    labs(y = "Time (years)", x = "Subject ID") + coord_flip() + theme_classic()
)

Creating a survival object

The Surv() function creates a survival object, i.e. the time variable and the failure or status indicator

library(survival)
orca <- mutate(orca, dead = (event != "Alive"))
su_obj = Surv(orca$time, orca$dead)
head(su_obj, n = 20)
 [1]  5.081+  0.419   7.915   2.480   2.500   0.167   5.925+  1.503  13.333   7.666+  2.834+  0.914   3.083   4.668 
[15]  4.816   2.166   5.837   0.162   3.666   0.504

The created survival object is then used as response variable in specific functions for survival analysis.

Estimating the Survival Function (Kaplan–Meier)

The survfit() function by the default estimate then survival curve using the Kaplan–Meier estimator

fit_km = survfit(su_obj ~ 1, data = orca)
print(fit_km, print.rmean = TRUE)
Call: survfit(formula = su_obj ~ 1, data = orca)

         n     events     *rmean *se(rmean)     median    0.95LCL    0.95UCL 
   338.000    229.000      8.060      0.465      5.418      4.331      6.916 
    * restricted mean with upper limit =  23.3

The fortify() function of the ggfortify package is useful to extract the whole survival table in a data.frame.

library(ggfortify)
fortify(fit_km) %>% head()
   time n.risk n.event n.censor      surv     std.err     upper     lower
1 0.085    338       2        0 0.9940828 0.004196498 1.0000000 0.9859401
2 0.162    336       2        0 0.9881657 0.005952486 0.9997618 0.9767041
3 0.167    334       4        0 0.9763314 0.008468952 0.9926726 0.9602592
4 0.170    330       2        0 0.9704142 0.009497400 0.9886472 0.9525175
5 0.246    328       1        0 0.9674556 0.009976176 0.9865584 0.9487228
6 0.249    327       1        0 0.9644970 0.010435745 0.9844277 0.9449699

Change the argument fun = events for the cumulative proportion, cumhaz for the cumulative hazard, cloglog for the complementary log−log

library(survminer)
ggsurvplot(fit_km, risk.table = TRUE, fun = "pct", censor = T, 
           xlab = "Time (years)", palette = "black", legend = "none")

Measures of central tendency

mc = data.frame(q = c(.25, .5, .75), km = quantile(fit_km))
mc
      q km.quantile km.lower km.upper
25 0.25       1.333    1.084    1.834
50 0.50       5.418    4.331    6.916
75 0.75      13.673   11.748   16.580

ggsurvplot(fit_km, xlab = "Time (years)", censor = F, legend = "none",
           surv.median.line = "hv")

Estimating the Survival Function (parametric)

Check ?flexsurvreg for a description of the different distributions

library(flexsurv)
fit_wei = flexsurvreg(su_obj ~ 1, data = orca, dist = "weibull")
fit_wei
Call:
flexsurvreg(formula = su_obj ~ 1, data = orca, dist = "weibull")

Estimates: 
       est    L95%   U95%   se   
shape  0.821  0.737  0.914  0.045
scale  8.426  7.193  9.871  0.680

N = 338,  Events: 229,  Censored: 109
Total time at risk: 1913.673
Log-likelihood = -708.0542, df = 2
AIC = 1420.108

data.frame(summary(fit_wei)) %>% head()
   time       est       lcl       ucl
1 0.085 0.9772491 0.9661026 0.9850261
2 0.162 0.9616852 0.9462490 0.9734671
3 0.167 0.9607367 0.9450804 0.9727476
4 0.170 0.9601706 0.9443845 0.9723172
5 0.246 0.9464460 0.9275947 0.9617109
6 0.249 0.9459254 0.9269537 0.9613017

library(gridExtra)
grid.arrange(
  ggplot(data.frame(summary(fit_wei)), aes(time, est)) + 
    geom_line() + labs(x = "Time (years)", y = "Survival"),
  ggplot(data.frame(summary(fit_wei, type = "hazard")), aes(time, est)) + 
    geom_line() + labs(x = "Time (years)", y = "Hazard"), ncol = 2
)

Comparison of survival curves

su_stg  = survfit(su_obj ~ stage, data = orca)
su_stg
Call: survfit(formula = su_obj ~ stage, data = orca)

            n events median 0.95LCL 0.95UCL
stage=I    50     25  10.56    6.17      NA
stage=II   77     51   7.92    4.92   13.34
stage=III  72     51   7.41    3.92    9.90
stage=IV   68     57   2.00    1.08    4.82
stage=unkn 71     45   3.67    2.83    8.17

survdiff(su_obj ~ stage, data = orca)
Call:
survdiff(formula = su_obj ~ stage, data = orca)

            N Observed Expected (O-E)^2/E (O-E)^2/V
stage=I    50       25     39.9     5.573     6.813
stage=II   77       51     63.9     2.606     3.662
stage=III  72       51     54.1     0.174     0.231
stage=IV   68       57     33.2    16.966    20.103
stage=unkn 71       45     37.9     1.346     1.642

 Chisq= 27.2  on 4 degrees of freedom, p= 1.78e-05

Change the argument fun = events to compare cumulative proportions, cumhaz for the cumulative hazards, cloglog for the complementary log−log

ggsurvplot(su_stg, fun = "event", censor = F, xlab = "Time (years)")

Modeling survival data (Cox PH model)

library(Epi)
m1 = coxph(su_obj ~ sex + I((age-65)/10) + stage, data = orca)
ci.exp(m1)
                 exp(Est.)      2.5%    97.5%
sexMale           1.421036 1.0770928 1.874809
I((age - 65)/10)  1.515929 1.3572535 1.693156
stageII           1.035537 0.6385626 1.679298
stageIII          1.412619 0.8727664 2.286398
stageIV           2.423992 1.5063276 3.900703
stageunkn         1.793932 1.0963386 2.935398

# check proportionallity of the hazard
cox.zph(m1)
                      rho    chisq     p
sexMale          -0.00137 0.000439 0.983
I((age - 65)/10)  0.07539 1.393597 0.238
stageII          -0.04208 0.411652 0.521
stageIII         -0.06915 1.083755 0.298
stageIV          -0.10044 2.301780 0.129
stageunkn        -0.09663 2.082042 0.149
GLOBAL                 NA 4.895492 0.557

newd = data.frame(sex = "Male", age = 40, stage = "IV")
fortify(survfit(m1, newdata = newd)) %>%
  ggplot(aes(x = time, y = surv)) +
  geom_step() + geom_ribbon(aes(ymin = lower, ymax = upper), alpha = .2) +
  labs(x = "Time (years)", y = "Survival probability") + theme_classic()

Modeling survival data (AFT model)

m2 = flexsurvreg(su_obj ~ sex + I((age-65)/10) + stage, data = orca, dist = "weibull")
m2
Call:
flexsurvreg(formula = su_obj ~ sex + I((age - 65)/10) + stage, 
    data = orca, dist = "weibull")

Estimates: 
                  data mean  est      L95%     U95%     se       exp(est)  L95%     U95%   
shape                  NA     0.9181   0.8272   1.0191   0.0489       NA        NA       NA
scale                  NA    14.5312   9.1847  22.9898   3.4012       NA        NA       NA
sexMale            0.5503    -0.4276  -0.7283  -0.1269   0.1534   0.6521    0.4827   0.8808
I((age - 65)/10)  -0.1486    -0.4613  -0.5817  -0.3409   0.0614   0.6305    0.5590   0.7111
stageII            0.2278    -0.0144  -0.5397   0.5109   0.2680   0.9857    0.5829   1.6668
stageIII           0.2130    -0.3538  -0.8783   0.1706   0.2676   0.7020    0.4155   1.1860
stageIV            0.2012    -0.9674  -1.4889  -0.4458   0.2661   0.3801    0.2256   0.6403
stageunkn          0.2101    -0.6513  -1.1863  -0.1163   0.2730   0.5214    0.3053   0.8902

N = 338,  Events: 229,  Censored: 109
Total time at risk: 1913.673
Log-likelihood = -662.2349, df = 8
AIC = 1340.47

newd = data_frame(sex = "Male", age = 40, stage = "IV")
data.frame(unname(summary(m2, newdata = newd))) %>%
  ggplot(aes(x = time, y = est)) +
  geom_step() + geom_ribbon(aes(ymin = lcl, ymax = ucl), alpha = .2) +
  labs(x = "Time (years)", y = "Survival probability") + 
  theme_classic()

Obtain summary statistics and produce useful graphs

Quantitative variable (age at diagnosis)

# first 6 observations
head(orca$age)
[1] 65.42274 83.08783 52.59008 77.08630 80.33622 82.58132

# summary statistics (range, quartiles, and mean)
summary(orca$age)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  15.15   53.24   64.86   63.51   74.29   92.24

# save sample size, mean, median, and standard deviation in a new object
stat_age = c(n = length(orca$age), mean = mean(orca$age), 
               median = median(orca$age), sd = sd(orca$age))
stat_age
        n      mean    median        sd 
338.00000  63.51398  64.85735  14.43288

Most common graphical presentation: histogram (?geom_histogram) and …

ggplot(orca, aes(age)) +
  geom_histogram()

ggplot(orca, aes(x = time, y = ..density..)) +
  geom_histogram(bins = 25) +
  geom_line(stat = "density")

… box plot (?geom_boxplot)

ggplot(orca, aes(x = 1, y = age)) + geom_boxplot() +
  labs(x = "", y = "Age (yr)") + theme_classic()

Categorical variable (event)

Table of frequencies are the typical way to summarize a categorical variable

# summary(orca$event)
tab = table(orca$event)
tab

         Alive Oral ca. death    Other death 
           109            122            107

# for proportions
round(prop.table(tab), 2)

         Alive Oral ca. death    Other death 
          0.32           0.36           0.32

Most common graphical presentation: bar plot (?geom_bar)

ggplot(orca, aes(event)) + geom_bar()

library(scales)
ggplot(orca, aes(event)) + 
  geom_bar(aes(y = ..count../sum(..count..))) +
  labs(x = "", y = "Frequency") +
  scale_y_continuous(labels = percent)

Bivariate association (age and stage)

table(orca$stage)

   I   II  III   IV unkn 
  50   77   72   68   71

orca %>% 
  group_by(stage) %>%
  summarize(mean = mean(age), se = sd(age))
# A tibble: 5 x 3
  stage  mean    se
  <fct> <dbl> <dbl>
1 I      62.9  14.9
2 II     64.6  14.6
3 III    61.7  13.6
4 IV     64.2  13.8
5 unkn   63.9  15.5

Graphical presentation: box plot

ggplot(orca, aes(x = stage, y = age)) + 
  geom_boxplot() + coord_flip()

mod = lm(age ~ stage, data = orca)
summary(mod)

Call:
lm(formula = age ~ stage, data = orca)

Residuals:
   Min     1Q Median     3Q    Max 
-47.73 -10.22   1.13  10.45  28.30 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)   62.881      2.047  30.712   <2e-16 ***
stageII        1.749      2.630   0.665    0.507    
stageIII      -1.189      2.665  -0.446    0.656    
stageIV        1.324      2.697   0.491    0.624    
stageunkn      1.053      2.673   0.394    0.694    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 14.48 on 333 degrees of freedom
Multiple R-squared:  0.005698,  Adjusted R-squared:  -0.006245 
F-statistic: 0.4771 on 4 and 333 DF,  p-value: 0.7526

Graphically compare the distributions

ggplot(orca, aes(x = age, col = stage)) +
  geom_line(stat = "density")

ggplot(orca, aes(x = age, group = stage)) +
  geom_line(stat = "density") +
  facet_grid(stage ~ .)

Bivariate association (sex and dead)

# create binary variable 'dead' (0 alive 1 dead)
orca = mutate(orca, dead = factor(event != "Alive", labels = c("Alive", "Dead")))
tab2 = with(orca, table(sex, dead))
addmargins(tab2)
        dead
sex      Alive Dead Sum
  Female    50  102 152
  Male      59  127 186
  Sum      109  229 338

# get proportion (margin: 1 by row, 2 by col)
100*prop.table(tab2, margin = 2)
        dead
sex         Alive     Dead
  Female 45.87156 44.54148
  Male   54.12844 55.45852

# Test the association
chisq.test(tab2)

    Pearson's Chi-squared test with Yates' continuity correction

data:  tab2
X-squared = 0.012725, df = 1, p-value = 0.9102

Check out the Epi package

library(Epi)
stat.table(list(sex, dead), 
           list(count(), percent(sex)), 
           orca, margin = T)
 --------------------------------- 
         ----------dead----------- 
 sex        Alive    Dead   Total  
 --------------------------------- 
 Female        50     102     152  
             45.9    44.5    45.0  
                                   
 Male          59     127     186  
             54.1    55.5    55.0  
                                   
                                   
 Total        109     229     338  
            100.0   100.0   100.0  
 ---------------------------------

with(orca, twoby2(sex, relevel(dead, 2)))
2 by 2 table analysis: 
------------------------------------------------------ 
Outcome   : Dead 
Comparing : Female vs. Male 

       Dead Alive    P(Dead) 95% conf. interval
Female  102    50     0.6711    0.5926   0.7410
Male    127    59     0.6828    0.6125   0.7456

                                    95% conf. interval
             Relative Risk:  0.9828    0.8474   1.1399
         Sample Odds Ratio:  0.9477    0.5994   1.4984
Conditional MLE Odds Ratio:  0.9479    0.5845   1.5396
    Probability difference: -0.0117   -0.1118   0.0870

             Exact P-value: 0.9069 
        Asymptotic P-value: 0.8183 
------------------------------------------------------

Or alternatively with the epitools package

library(epitools)
with(orca, epitab(table(dead, sex), verbose = T))

Graphical presentation: bar plot

ggplot(orca, aes(x = dead, group = sex)) + 
  geom_bar(aes(y = ..prop.., fill = sex), 
           position = "dodge") + 
  scale_y_continuous(labels = percent)

ggplot(orca, aes(x = dead, group = sex)) + 
  geom_bar(aes(y = ..prop..), position = "dodge") +
  facet_grid(~ sex) +
  scale_y_continuous(labels = percent)

If time to event was not of interest (not this case)

mod_bin = glm(dead ~ relevel(sex, 2), data = orca, family = "binomial")
# presenting odds ratio
ci.exp(mod_bin)
                      exp(Est.)     2.5%    97.5%
(Intercept)           2.1525424 1.580672 2.931309
relevel(sex, 2)Female 0.9477165 0.599421 1.498390

# predicted probabilities
orca %>%
  mutate(prob = predict(mod_bin, type = "response")) %>%
  head()
  id    sex      age stage  time          event  dead      prob
1  1   Male 65.42274  unkn 5.081          Alive Alive 0.6827957
2  2 Female 83.08783   III 0.419 Oral ca. death  Dead 0.6710526
3  3   Male 52.59008    II 7.915    Other death  Dead 0.6827957
4  4   Male 77.08630     I 2.480    Other death  Dead 0.6827957
5  5   Male 80.33622    IV 2.500 Oral ca. death  Dead 0.6827957
6  6 Female 82.58132    IV 0.167    Other death  Dead 0.6710526

Multivariate association (sex, age, and dead)

orca %>%
  group_by(sex, dead) %>%
  summarise(mean = mean(age), sd = sd(age))
# A tibble: 4 x 4
# Groups:   sex [?]
  sex    dead   mean    sd
  <fct>  <fct> <dbl> <dbl>
1 Female Alive  61.1  14.2
2 Female Dead   71.2  12.7
3 Male   Alive  51.9  14.1
4 Male   Dead   63.6  12.0

stat.table(index = list(sex, dead),
           contents = list(mean(age), sd(age)),
           data = orca, margins = T)
 --------------------------------- 
         ----------dead----------- 
 sex        Alive    Dead   Total  
 --------------------------------- 
 Female     61.12   71.21   67.89  
            14.23   12.75   14.04  
                                   
 Male       51.94   63.65   59.94  
            14.07   12.00   13.78  
                                   
                                   
 Total      56.15   67.02   63.51  
            14.81   12.88   14.43  
 ---------------------------------

ggplot(orca, aes(x = age, col = dead)) + 
  geom_line(stat = "density") +
  facet_grid(~ sex)