This is an adaptation of the code I used in the Portland PDXR meetup.
library(survival)
head(lung)
## inst time status age sex ph.ecog ph.karno pat.karno meal.cal wt.loss
## 1 3 306 2 74 1 1 90 100 1175 NA
## 2 3 455 2 68 1 0 90 90 1225 15
## 3 3 1010 1 56 1 0 90 90 NA 15
## 4 5 210 2 57 1 1 90 60 1150 11
## 5 1 883 2 60 1 0 100 90 NA 0
## 6 12 1022 1 74 1 1 50 80 513 0
surv_fit <- survfit(coxph(Surv(time, status) ~ age + sex, lung))
surv_fit
## Call: survfit(formula = coxph(Surv(time, status) ~ age + sex, lung))
##
## n events median 0.95LCL 0.95UCL
## 228 165 320 285 363
library(broom)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(scales)
td <- tidy(surv_fit)
ggplot(td, aes(time, estimate)) +
geom_line() +
geom_ribbon(aes(ymin = conf.low, ymax = conf.high),
alpha = .5) +
scale_y_continuous(labels = percent_format()) +
ylab("Predicted survival")
library(tidyr)
library(purrr)
##
## Attaching package: 'purrr'
## The following object is masked from 'package:scales':
##
## discard
## The following objects are masked from 'package:dplyr':
##
## contains, order_by
library(broom)
ggplot(Orange, aes(age, circumference)) +
geom_point()
lm(circumference ~ age, Orange)
##
## Call:
## lm(formula = circumference ~ age, data = Orange)
##
## Coefficients:
## (Intercept) age
## 17.3997 0.1068
Tidying coefficients:
models <- Orange %>%
nest(-Tree) %>%
mutate(mod = map(data, ~ lm(circumference ~ age, data = .)))
coefficients <- models %>%
mutate(coefs = map(mod, tidy, conf.int = TRUE)) %>%
unnest(coefs)
coefficients %>%
filter(term == "age") %>%
ggplot(aes(estimate, Tree)) +
geom_point() +
geom_errorbarh(aes(xmin = conf.low, xmax = conf.high)) +
xlab("Growth rate in mm/day")
Tidying residuals with augment:
obs <- models %>%
mutate(obs = map(mod, augment)) %>%
unnest(obs)
ggplot(obs, aes(age, .resid, color = Tree)) +
geom_point()
Recall the tidy survival curve:
library(modelr)
##
## Attaching package: 'modelr'
## The following object is masked from 'package:broom':
##
## bootstrap
library(purrr)
library(tidyr)
head(lung)
## inst time status age sex ph.ecog ph.karno pat.karno meal.cal wt.loss
## 1 3 306 2 74 1 1 90 100 1175 NA
## 2 3 455 2 68 1 0 90 90 1225 15
## 3 3 1010 1 56 1 0 90 90 NA 15
## 4 5 210 2 57 1 1 90 60 1150 11
## 5 1 883 2 60 1 0 100 90 NA 0
## 6 12 1022 1 74 1 1 50 80 513 0
surv_fit <- survfit(coxph(Surv(time, status) ~ age + sex, lung))
td <- tidy(surv_fit)
ggplot(td, aes(time, estimate)) +
geom_line() +
geom_ribbon(aes(ymin = conf.low, ymax = conf.high),
alpha = .5) +
scale_y_continuous(labels = percent_format()) +
ylab("Predicted survival")
We can use the bootstrap function from the modelr package to perform this many times:
library(modelr)
lung %>%
bootstrap(100)
## # A tibble: 100 x 2
## strap .id
## <list> <chr>
## 1 <S3: resample> 001
## 2 <S3: resample> 002
## 3 <S3: resample> 003
## 4 <S3: resample> 004
## 5 <S3: resample> 005
## 6 <S3: resample> 006
## 7 <S3: resample> 007
## 8 <S3: resample> 008
## 9 <S3: resample> 009
## 10 <S3: resample> 010
## # ... with 90 more rows
bootstrapped_survs <- lung %>%
bootstrap(100) %>%
mutate(mod = map(strap, ~ survfit(coxph(Surv(time, status) ~ age + sex, .)))) %>%
unnest(map(mod, tidy))
ggplot(bootstrapped_survs, aes(time, estimate, group = .id)) +
geom_line(alpha = .2)
bootstrapped_survs_summ <- bootstrapped_survs %>%
group_by(time) %>%
summarize(median = median(estimate),
low = quantile(estimate, .025),
high = quantile(estimate, .975))
bootstrapped_survs_summ
## # A tibble: 186 x 4
## time median low high
## <dbl> <dbl> <dbl> <dbl>
## 1 5 0.9938535 0.9871814 0.9960097
## 2 11 0.9832965 0.9664918 0.9958157
## 3 12 0.9800576 0.9601024 0.9923554
## 4 13 0.9713533 0.9535164 0.9912766
## 5 15 0.9664579 0.9424655 0.9874527
## 6 26 0.9628420 0.9365092 0.9770833
## 7 30 0.9582126 0.9332997 0.9788948
## 8 31 0.9537011 0.9266329 0.9747833
## 9 53 0.9487279 0.9144704 0.9695035
## 10 54 0.9408214 0.9115521 0.9690981
## # ... with 176 more rows
ggplot(bootstrapped_survs_summ, aes(time, median)) +
geom_line() +
geom_ribbon(aes(ymin = low, ymax = high), alpha = .2)
Simulation setup:
library(dplyr)
set.seed(2014)
centers <- data_frame(oracle = factor(1:3),
size = c(100, 150, 50),
x1 = c(5, 0, -3),
x2 = c(-1, 1, -2))
original_data <- centers %>%
group_by(oracle) %>%
do(data.frame(x1 = rnorm(.$size[1], .$x1[1]),
x2 = rnorm(.$size[1], .$x2[1]))) %>%
ungroup() %>%
select(-oracle)
original_data
## # A tibble: 300 x 2
## x1 x2
## <dbl> <dbl>
## 1 4.434320 0.5416470
## 2 5.321046 -0.9412882
## 3 5.125271 -1.5802282
## 4 6.353225 -1.6040549
## 5 3.712270 -3.4079344
## 6 5.322555 -0.7716317
## 7 5.266928 0.5857579
## 8 5.399710 -0.7006604
## 9 5.458846 -1.8196312
## 10 7.150210 -0.5586259
## # ... with 290 more rows
library(ggplot2)
ggplot(original_data, aes(x1, x2)) +
geom_point()
library(tidyr)
library(purrr)
m <- as.matrix(original_data)
k <- kmeans(m, 2)
kclusts <- data_frame(k = 1:9) %>%
mutate(mod = map(k, ~ kmeans(m, .)))
centers <- kclusts %>%
unnest(map(mod, tidy))
## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character
points <- kclusts %>%
unnest(map(mod, augment, data = m))
## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character
ggplot(centers, aes(x1, x2)) +
geom_point(aes(color = .cluster), data = points) +
geom_point(shape = "x", size = 6) +
facet_wrap(~ k)
models <- kclusts %>%
unnest(map(mod, glance))
ggplot(models, aes(k, tot.withinss)) +
geom_line()