Hadley uses the gapminder dataset to demonstrate the value of dataframes of nested dataframes/lists for comparing models. Each functions can be run on each dataframe using the map() function from the purrr package (basically Hadley’s implementation of lapply()).
As usual the talk is great. Check it out at YouTube.
by_country <- gapminder %>%
mutate(year1950 = year - 1950 ) %>% # dataset starts at 1950
group_by(continent, country) %>%
nest() # all columns is placed in a dataframe
# which is placed into a column "data"
country_model <- function(df){
lm(lifeExp ~ year1950, data = df)
}
models <- by_country %>%
mutate(model = map(data, country_model)) %>%
# model results are summarised in tidy dataframes using broom
mutate(glance = map(model, broom::glance),
rsq = glance %>% map_dbl("r.squared"),
tidy = map(model, broom::tidy),
augment= map(model, broom::augment))
models
## # A tibble: 142 × 8
## continent country data model glance
## <fctr> <fctr> <list> <list> <list>
## 1 Asia Afghanistan <tibble [12 × 5]> <S3: lm> <data.frame [1 × 11]>
## 2 Europe Albania <tibble [12 × 5]> <S3: lm> <data.frame [1 × 11]>
## 3 Africa Algeria <tibble [12 × 5]> <S3: lm> <data.frame [1 × 11]>
## 4 Africa Angola <tibble [12 × 5]> <S3: lm> <data.frame [1 × 11]>
## 5 Americas Argentina <tibble [12 × 5]> <S3: lm> <data.frame [1 × 11]>
## 6 Oceania Australia <tibble [12 × 5]> <S3: lm> <data.frame [1 × 11]>
## 7 Europe Austria <tibble [12 × 5]> <S3: lm> <data.frame [1 × 11]>
## 8 Asia Bahrain <tibble [12 × 5]> <S3: lm> <data.frame [1 × 11]>
## 9 Asia Bangladesh <tibble [12 × 5]> <S3: lm> <data.frame [1 × 11]>
## 10 Europe Belgium <tibble [12 × 5]> <S3: lm> <data.frame [1 × 11]>
## # ... with 132 more rows, and 3 more variables: rsq <dbl>, tidy <list>,
## # augment <list>
ggplot(models, aes(x = rsq, y= reorder(country, rsq))) +
geom_point(aes(color = continent)) +
labs(y = "country") +
theme(axis.text.y = element_blank(),
axis.ticks.y= element_blank())
you can get the data out of each of the result dataframes using the
unnest() function.
models %>% unnest(glance)
## # A tibble: 142 × 18
## continent country data model rsq
## <fctr> <fctr> <list> <list> <dbl>
## 1 Asia Afghanistan <tibble [12 × 5]> <S3: lm> 0.9477123
## 2 Europe Albania <tibble [12 × 5]> <S3: lm> 0.9105778
## 3 Africa Algeria <tibble [12 × 5]> <S3: lm> 0.9851172
## 4 Africa Angola <tibble [12 × 5]> <S3: lm> 0.8878146
## 5 Americas Argentina <tibble [12 × 5]> <S3: lm> 0.9955681
## 6 Oceania Australia <tibble [12 × 5]> <S3: lm> 0.9796477
## 7 Europe Austria <tibble [12 × 5]> <S3: lm> 0.9921340
## 8 Asia Bahrain <tibble [12 × 5]> <S3: lm> 0.9667398
## 9 Asia Bangladesh <tibble [12 × 5]> <S3: lm> 0.9893609
## 10 Europe Belgium <tibble [12 × 5]> <S3: lm> 0.9945406
## # ... with 132 more rows, and 13 more variables: tidy <list>,
## # augment <list>, r.squared <dbl>, adj.r.squared <dbl>, sigma <dbl>,
## # statistic <dbl>, p.value <dbl>, df <int>, logLik <dbl>, AIC <dbl>,
## # BIC <dbl>, deviance <dbl>, df.residual <int>
models %>%
unnest(tidy) %>%
select(continent, country, term, estimate, rsq) %>%
spread(term, estimate) %>%
ggplot(aes(`(Intercept)`, year1950)) + # Life Expectancy in 1950 vs. slope ("yearlt increase")
geom_point(aes(color= continent, size = rsq))+
geom_smooth(se= FALSE) +
xlab("Life expectancy (1950)") +
ylab("Yearly Improvement") +
scale_size_area()
## `geom_smooth()` using method = 'loess'
models %>%
unnest(augment) %>%
ggplot(aes(year1950, .resid)) +
geom_line(aes(group= country), alpha = 1/3) +
geom_hline(yintercept = 0, color= "white", size= 2) +
geom_smooth(se= FALSE) +
facet_wrap(~ continent)
## `geom_smooth()` using method = 'loess'