Examples of using the broom package

This is an adaptation of the code I used in the Portland PDXR meetup.

Visualization

library(survival)
head(lung)
##   inst time status age sex ph.ecog ph.karno pat.karno meal.cal wt.loss
## 1    3  306      2  74   1       1       90       100     1175      NA
## 2    3  455      2  68   1       0       90        90     1225      15
## 3    3 1010      1  56   1       0       90        90       NA      15
## 4    5  210      2  57   1       1       90        60     1150      11
## 5    1  883      2  60   1       0      100        90       NA       0
## 6   12 1022      1  74   1       1       50        80      513       0
surv_fit <- survfit(coxph(Surv(time, status) ~ age + sex, lung))

surv_fit
## Call: survfit(formula = coxph(Surv(time, status) ~ age + sex, lung))
## 
##       n  events  median 0.95LCL 0.95UCL 
##     228     165     320     285     363
library(broom)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(scales)

td <- tidy(surv_fit)

ggplot(td, aes(time, estimate)) +
  geom_line() +
  geom_ribbon(aes(ymin = conf.low, ymax = conf.high),
              alpha = .5) +
  scale_y_continuous(labels = percent_format()) +
  ylab("Predicted survival")

Nest, model, tidy, unnest

library(tidyr)
library(purrr)
## 
## Attaching package: 'purrr'
## The following object is masked from 'package:scales':
## 
##     discard
## The following objects are masked from 'package:dplyr':
## 
##     contains, order_by
library(broom)
ggplot(Orange, aes(age, circumference)) +
  geom_point()

lm(circumference ~ age, Orange)
## 
## Call:
## lm(formula = circumference ~ age, data = Orange)
## 
## Coefficients:
## (Intercept)          age  
##     17.3997       0.1068

Tidying coefficients:

models <- Orange %>%
  nest(-Tree) %>%
  mutate(mod = map(data, ~ lm(circumference ~ age, data = .)))

coefficients <- models %>%
  mutate(coefs = map(mod, tidy, conf.int = TRUE)) %>%
  unnest(coefs)

coefficients %>%
  filter(term == "age") %>%
  ggplot(aes(estimate, Tree)) +
  geom_point() +
  geom_errorbarh(aes(xmin = conf.low, xmax = conf.high)) +
  xlab("Growth rate in mm/day")

Tidying residuals with augment:

obs <- models %>%
  mutate(obs = map(mod, augment)) %>%
  unnest(obs)

ggplot(obs, aes(age, .resid, color = Tree)) +
  geom_point()

Bootstrap

Recall the tidy survival curve:

library(modelr)
## 
## Attaching package: 'modelr'
## The following object is masked from 'package:broom':
## 
##     bootstrap
library(purrr)
library(tidyr)

head(lung)
##   inst time status age sex ph.ecog ph.karno pat.karno meal.cal wt.loss
## 1    3  306      2  74   1       1       90       100     1175      NA
## 2    3  455      2  68   1       0       90        90     1225      15
## 3    3 1010      1  56   1       0       90        90       NA      15
## 4    5  210      2  57   1       1       90        60     1150      11
## 5    1  883      2  60   1       0      100        90       NA       0
## 6   12 1022      1  74   1       1       50        80      513       0
surv_fit <- survfit(coxph(Surv(time, status) ~ age + sex, lung))

td <- tidy(surv_fit)

ggplot(td, aes(time, estimate)) +
  geom_line() +
  geom_ribbon(aes(ymin = conf.low, ymax = conf.high),
              alpha = .5) +
  scale_y_continuous(labels = percent_format()) +
  ylab("Predicted survival")

We can use the bootstrap function from the modelr package to perform this many times:

library(modelr)

lung %>%
  bootstrap(100)
## # A tibble: 100 x 2
##             strap   .id
##            <list> <chr>
## 1  <S3: resample>   001
## 2  <S3: resample>   002
## 3  <S3: resample>   003
## 4  <S3: resample>   004
## 5  <S3: resample>   005
## 6  <S3: resample>   006
## 7  <S3: resample>   007
## 8  <S3: resample>   008
## 9  <S3: resample>   009
## 10 <S3: resample>   010
## # ... with 90 more rows
bootstrapped_survs <- lung %>%
  bootstrap(100) %>%
  mutate(mod = map(strap, ~ survfit(coxph(Surv(time, status) ~ age + sex, .)))) %>%
  unnest(map(mod, tidy))
ggplot(bootstrapped_survs, aes(time, estimate, group = .id)) +
  geom_line(alpha = .2)

bootstrapped_survs_summ <- bootstrapped_survs %>%
  group_by(time) %>%
  summarize(median = median(estimate),
            low = quantile(estimate, .025),
            high = quantile(estimate, .975))

bootstrapped_survs_summ
## # A tibble: 186 x 4
##     time    median       low      high
##    <dbl>     <dbl>     <dbl>     <dbl>
## 1      5 0.9938535 0.9871814 0.9960097
## 2     11 0.9832965 0.9664918 0.9958157
## 3     12 0.9800576 0.9601024 0.9923554
## 4     13 0.9713533 0.9535164 0.9912766
## 5     15 0.9664579 0.9424655 0.9874527
## 6     26 0.9628420 0.9365092 0.9770833
## 7     30 0.9582126 0.9332997 0.9788948
## 8     31 0.9537011 0.9266329 0.9747833
## 9     53 0.9487279 0.9144704 0.9695035
## 10    54 0.9408214 0.9115521 0.9690981
## # ... with 176 more rows
ggplot(bootstrapped_survs_summ, aes(time, median)) +
  geom_line() +
  geom_ribbon(aes(ymin = low, ymax = high), alpha = .2)

K-means clustering

Simulation setup:

library(dplyr)

set.seed(2014)
centers <- data_frame(oracle = factor(1:3),
                      size = c(100, 150, 50),
                      x1 = c(5, 0, -3),
                      x2 = c(-1, 1, -2))

original_data <- centers %>%
  group_by(oracle) %>%
  do(data.frame(x1 = rnorm(.$size[1], .$x1[1]),
                x2 = rnorm(.$size[1], .$x2[1]))) %>%
  ungroup() %>%
  select(-oracle)
original_data
## # A tibble: 300 x 2
##          x1         x2
##       <dbl>      <dbl>
## 1  4.434320  0.5416470
## 2  5.321046 -0.9412882
## 3  5.125271 -1.5802282
## 4  6.353225 -1.6040549
## 5  3.712270 -3.4079344
## 6  5.322555 -0.7716317
## 7  5.266928  0.5857579
## 8  5.399710 -0.7006604
## 9  5.458846 -1.8196312
## 10 7.150210 -0.5586259
## # ... with 290 more rows
library(ggplot2)
ggplot(original_data, aes(x1, x2)) +
  geom_point()

library(tidyr)
library(purrr)

m <- as.matrix(original_data)

k <- kmeans(m, 2)
kclusts <- data_frame(k = 1:9) %>%
  mutate(mod = map(k, ~ kmeans(m, .)))

centers <- kclusts %>%
  unnest(map(mod, tidy))
## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character
points <- kclusts %>%
  unnest(map(mod, augment, data = m))
## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character
ggplot(centers, aes(x1, x2)) +
  geom_point(aes(color = .cluster), data = points) +
  geom_point(shape = "x", size = 6) +
  facet_wrap(~ k)

models <- kclusts %>%
  unnest(map(mod, glance))

ggplot(models, aes(k, tot.withinss)) +
  geom_line()