Today we are going to fit a simple regression model with yearly population to year, by each country of the world. The data is available on the gapminder package. And we will use functional programming to apply regression model to each country with an easy 10 lines of code.
require(tidyverse)
require(gapminder)
head(gapminder)## # A tibble: 6 x 6
## country continent year lifeExp pop gdpPercap
## <fct> <fct> <int> <dbl> <int> <dbl>
## 1 Afghanistan Asia 1952 28.8 8425333 779.
## 2 Afghanistan Asia 1957 30.3 9240934 821.
## 3 Afghanistan Asia 1962 32.0 10267083 853.
## 4 Afghanistan Asia 1967 34.0 11537966 836.
## 5 Afghanistan Asia 1972 36.1 13079460 740.
## 6 Afghanistan Asia 1977 38.4 14880372 786.
gapminder %>%
ggplot(aes(year, pop, color = country)) +
geom_line(show.legend = F) +
facet_wrap(~continent) +
scale_y_log10() + # Took log to make the y axis readable
theme_light()We can clearly see some difference within and between continents. There is also a linear pattern in increase of population with year. Lets fit the regression model and check which countries and continent have highest yearly population growth rate.
require(broom)
require(purrr)
reg_models <- gapminder %>%
group_by(country, continent) %>%
nest() %>%
mutate(models = map(data, ~lm(pop ~ year, data = .x))) %>%
mutate(coefs = map(models, ~tidy(.x)),
GOF = map(models, ~glance(.x))) %>%
unnest(coefs)
### FItted Models and Coefficients
reg_models ## # A tibble: 284 x 10
## # Groups: country, continent [142]
## country continent data models term estimate std.error statistic p.value
## <fct> <fct> <list> <list> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Afghani~ Asia <tibbl~ <lm> (Int~ -6.91e8 1.05e8 -6.55 6.48e- 5
## 2 Afghani~ Asia <tibbl~ <lm> year 3.57e5 5.33e4 6.70 5.37e- 5
## 3 Albania Europe <tibbl~ <lm> (Int~ -8.75e7 3.95e6 -22.1 7.90e-10
## 4 Albania Europe <tibbl~ <lm> year 4.55e4 2.00e3 22.8 5.94e-10
## 5 Algeria Africa <tibbl~ <lm> (Int~ -9.16e8 4.25e7 -21.5 1.04e- 9
## 6 Algeria Africa <tibbl~ <lm> year 4.73e5 2.15e4 22.0 8.39e-10
## 7 Angola Africa <tibbl~ <lm> (Int~ -2.78e8 2.11e7 -13.2 1.22e- 7
## 8 Angola Africa <tibbl~ <lm> year 1.44e5 1.07e4 13.5 9.51e- 8
## 9 Argenti~ Americas <tibbl~ <lm> (Int~ -7.99e8 1.52e7 -52.5 1.53e-13
## 10 Argenti~ Americas <tibbl~ <lm> year 4.18e5 7.69e3 54.3 1.08e-13
## # ... with 274 more rows, and 1 more variable: GOF <list>
g1 <- reg_models %>%
filter(term == "year") %>%
arrange(estimate) %>%
unnest(data) %>%
group_by(country, continent, estimate) %>%
nest() %>%
ggplot(aes(x = reorder(country, estimate), y = estimate, fill = continent)) +
geom_col() +
scale_y_log10() +
theme_light() +
theme(axis.text.x = element_text(size=3, angle = 90),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank()) +
labs(title = "Estimated per year Population Growth Rate by Country",
x = "Country",
y = "Coefficient / Estimate on log_10 scale",
fill = "Continent")
plotly::ggplotly(g1)Looks like Asia, Africa and South American countries have the highest population growth rates.
require(ggrepel)
reg_models %>%
filter(term == "year") %>%
select(-std.error, -statistic, -p.value) %>% ### Overlaps with augment, so remove
unnest(GOF) %>%
mutate(`Inverse-R-Squared` = 1/ round(r.squared, 2)) %>%
ggplot(aes(x = estimate, y = p.value, color = continent)) +
geom_point(aes(size = `Inverse-R-Squared`)) +
guides(color = "none") + ### Hide Continents from Legend, But keep Size Legend
geom_hline(yintercept = 0.05,
linetype = "longdash",
alpha = 0.6) +
labs(shape="Inverse R Squared", colour="Continents",
x = "Coefficient Value (Yearly Growth Rate)",
y = "P-value of Coefficient",
title = "Coefficient, Pvalue of Coefficients, R-Squared",
subtitle = "Continents sorted by Median Population Growth Rate",
caption = "Both Axis on log scale"
) +
scale_y_log10() +
scale_x_log10() +
geom_text_repel(aes(label = country),
size = 2.2,
alpha = 0.5,
show.legend = F,
max.overlaps = 20,
force_pull = 0.7, max.time = 1,
min.segment.length = unit(0, 'lines'),
) +
### Sort continents by median coefficient
facet_wrap(~ fct_reorder(.f = as.factor(continent), .x = estimate, .fun = median)) +
theme_light() +
theme(legend.position = "bottom",
legend.direction = "horizontal",
legend.box = "vertical") ### First Group by Country and COntinent
gapminder %>%
group_by(country, continent)## # A tibble: 1,704 x 6
## # Groups: country, continent [142]
## country continent year lifeExp pop gdpPercap
## <fct> <fct> <int> <dbl> <int> <dbl>
## 1 Afghanistan Asia 1952 28.8 8425333 779.
## 2 Afghanistan Asia 1957 30.3 9240934 821.
## 3 Afghanistan Asia 1962 32.0 10267083 853.
## 4 Afghanistan Asia 1967 34.0 11537966 836.
## 5 Afghanistan Asia 1972 36.1 13079460 740.
## 6 Afghanistan Asia 1977 38.4 14880372 786.
## 7 Afghanistan Asia 1982 39.9 12881816 978.
## 8 Afghanistan Asia 1987 40.8 13867957 852.
## 9 Afghanistan Asia 1992 41.7 16317921 649.
## 10 Afghanistan Asia 1997 41.8 22227415 635.
## # ... with 1,694 more rows
### Then nest the data
gapminder %>%
group_by(country, continent) %>%
nest() ## # A tibble: 142 x 3
## # Groups: country, continent [142]
## country continent data
## <fct> <fct> <list>
## 1 Afghanistan Asia <tibble [12 x 4]>
## 2 Albania Europe <tibble [12 x 4]>
## 3 Algeria Africa <tibble [12 x 4]>
## 4 Angola Africa <tibble [12 x 4]>
## 5 Argentina Americas <tibble [12 x 4]>
## 6 Australia Oceania <tibble [12 x 4]>
## 7 Austria Europe <tibble [12 x 4]>
## 8 Bahrain Asia <tibble [12 x 4]>
## 9 Bangladesh Asia <tibble [12 x 4]>
## 10 Belgium Europe <tibble [12 x 4]>
## # ... with 132 more rows
### Apply Linear Regression to the column data
### Inside the data, there are columns called pop and year
### .x means take everything from each tibble of the data
gapminder %>%
group_by(country, continent) %>%
nest() %>%
mutate(models = map(data, ~lm(pop ~ year, data = .x)))## # A tibble: 142 x 4
## # Groups: country, continent [142]
## country continent data models
## <fct> <fct> <list> <list>
## 1 Afghanistan Asia <tibble [12 x 4]> <lm>
## 2 Albania Europe <tibble [12 x 4]> <lm>
## 3 Algeria Africa <tibble [12 x 4]> <lm>
## 4 Angola Africa <tibble [12 x 4]> <lm>
## 5 Argentina Americas <tibble [12 x 4]> <lm>
## 6 Australia Oceania <tibble [12 x 4]> <lm>
## 7 Austria Europe <tibble [12 x 4]> <lm>
## 8 Bahrain Asia <tibble [12 x 4]> <lm>
## 9 Bangladesh Asia <tibble [12 x 4]> <lm>
## 10 Belgium Europe <tibble [12 x 4]> <lm>
## # ... with 132 more rows
### We created the model column
### Now we tidy the model column
### This created tibbles of coefficients for each country
gapminder %>%
group_by(country, continent) %>%
nest() %>%
mutate(models = map(data, ~lm(pop ~ year, data = .x))) %>%
mutate(coefs = map(models, ~tidy(.x)))## # A tibble: 142 x 5
## # Groups: country, continent [142]
## country continent data models coefs
## <fct> <fct> <list> <list> <list>
## 1 Afghanistan Asia <tibble [12 x 4]> <lm> <tibble [2 x 5]>
## 2 Albania Europe <tibble [12 x 4]> <lm> <tibble [2 x 5]>
## 3 Algeria Africa <tibble [12 x 4]> <lm> <tibble [2 x 5]>
## 4 Angola Africa <tibble [12 x 4]> <lm> <tibble [2 x 5]>
## 5 Argentina Americas <tibble [12 x 4]> <lm> <tibble [2 x 5]>
## 6 Australia Oceania <tibble [12 x 4]> <lm> <tibble [2 x 5]>
## 7 Austria Europe <tibble [12 x 4]> <lm> <tibble [2 x 5]>
## 8 Bahrain Asia <tibble [12 x 4]> <lm> <tibble [2 x 5]>
## 9 Bangladesh Asia <tibble [12 x 4]> <lm> <tibble [2 x 5]>
## 10 Belgium Europe <tibble [12 x 4]> <lm> <tibble [2 x 5]>
## # ... with 132 more rows
### Finally unnest the coefs column to get our estimates
gapminder %>%
group_by(country, continent) %>%
nest() %>%
mutate(models = map(data, ~lm(pop ~ year, data = .x))) %>%
mutate(coefs = map(models, ~tidy(.x))) %>%
unnest(coefs) ## # A tibble: 284 x 9
## # Groups: country, continent [142]
## country continent data models term estimate std.error statistic p.value
## <fct> <fct> <list> <list> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Afghani~ Asia <tibbl~ <lm> (Int~ -6.91e8 1.05e8 -6.55 6.48e- 5
## 2 Afghani~ Asia <tibbl~ <lm> year 3.57e5 5.33e4 6.70 5.37e- 5
## 3 Albania Europe <tibbl~ <lm> (Int~ -8.75e7 3.95e6 -22.1 7.90e-10
## 4 Albania Europe <tibbl~ <lm> year 4.55e4 2.00e3 22.8 5.94e-10
## 5 Algeria Africa <tibbl~ <lm> (Int~ -9.16e8 4.25e7 -21.5 1.04e- 9
## 6 Algeria Africa <tibbl~ <lm> year 4.73e5 2.15e4 22.0 8.39e-10
## 7 Angola Africa <tibbl~ <lm> (Int~ -2.78e8 2.11e7 -13.2 1.22e- 7
## 8 Angola Africa <tibbl~ <lm> year 1.44e5 1.07e4 13.5 9.51e- 8
## 9 Argenti~ Americas <tibbl~ <lm> (Int~ -7.99e8 1.52e7 -52.5 1.53e-13
## 10 Argenti~ Americas <tibbl~ <lm> year 4.18e5 7.69e3 54.3 1.08e-13
## # ... with 274 more rows
### ALso unnest the GOF column to get our GOF estimates
gapminder %>%
group_by(country, continent) %>%
nest() %>%
mutate(models = map(data, ~lm(pop ~ year, data = .x))) %>%
mutate(coefs = map(models, ~tidy(.x)),
GOF = map(models, ~glance(.x))) %>%
unnest(GOF) ## # A tibble: 142 x 17
## # Groups: country, continent [142]
## country continent data models coefs r.squared adj.r.squared sigma statistic
## <fct> <fct> <lis> <list> <lis> <dbl> <dbl> <dbl> <dbl>
## 1 Afghan~ Asia <tib~ <lm> <tib~ 0.818 0.800 3.19e6 44.9
## 2 Albania Europe <tib~ <lm> <tib~ 0.981 0.979 1.19e5 520.
## 3 Algeria Africa <tib~ <lm> <tib~ 0.980 0.978 1.28e6 485.
## 4 Angola Africa <tib~ <lm> <tib~ 0.948 0.943 6.39e5 183.
## 5 Argent~ Americas <tib~ <lm> <tib~ 0.997 0.996 4.60e5 2953.
## 6 Austra~ Oceania <tib~ <lm> <tib~ 0.999 0.999 9.90e4 17192.
## 7 Austria Europe <tib~ <lm> <tib~ 0.960 0.956 9.18e4 240.
## 8 Bahrain Asia <tib~ <lm> <tib~ 0.973 0.971 3.61e4 366.
## 9 Bangla~ Asia <tib~ <lm> <tib~ 0.983 0.981 4.82e6 561.
## 10 Belgium Europe <tib~ <lm> <tib~ 0.939 0.933 1.35e5 154.
## # ... with 132 more rows, and 8 more variables: p.value <dbl>, df <dbl>,
## # logLik <dbl>, AIC <dbl>, BIC <dbl>, deviance <dbl>, df.residual <int>,
## # nobs <int>