Harold Nelson
10/24/2018
library(tidyverse)
## ── Attaching packages ── tidyverse 1.2.1 ──
## ✔ ggplot2 3.0.0 ✔ purrr 0.2.5
## ✔ tibble 1.4.2 ✔ dplyr 0.7.6
## ✔ tidyr 0.8.1 ✔ stringr 1.3.1
## ✔ readr 1.1.1 ✔ forcats 0.3.0
## ── Conflicts ───── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(socviz)
library(gapminder)
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
library(olsrr)
##
## Attaching package: 'olsrr'
## The following object is masked from 'package:MASS':
##
## cement
## The following object is masked from 'package:datasets':
##
## rivers
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:MASS':
##
## select
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
Healy creates this model to demonstrate several things.
out <- lm(formula = lifeExp ~ gdpPercap + pop + continent,
data = gapminder)
summary(out)
##
## Call:
## lm(formula = lifeExp ~ gdpPercap + pop + continent, data = gapminder)
##
## Residuals:
## Min 1Q Median 3Q Max
## -49.161 -4.486 0.297 5.110 25.175
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.781e+01 3.395e-01 140.819 < 2e-16 ***
## gdpPercap 4.495e-04 2.346e-05 19.158 < 2e-16 ***
## pop 6.570e-09 1.975e-09 3.326 0.000901 ***
## continentAmericas 1.348e+01 6.000e-01 22.458 < 2e-16 ***
## continentAsia 8.193e+00 5.712e-01 14.342 < 2e-16 ***
## continentEurope 1.747e+01 6.246e-01 27.973 < 2e-16 ***
## continentOceania 1.808e+01 1.782e+00 10.146 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.365 on 1697 degrees of freedom
## Multiple R-squared: 0.5821, Adjusted R-squared: 0.5806
## F-statistic: 393.9 on 6 and 1697 DF, p-value: < 2.2e-16
Healy creates the following grid of values to use with the predict function. Note that this involves values that were not in the original dataset. The median value of population is global, not specific to a continent or year.
min_gdp <- min(gapminder$gdpPercap)
max_gdp <- max(gapminder$gdpPercap)
med_pop <- median(gapminder$pop)
pred_df <- expand.grid(gdpPercap = (seq(from = min_gdp,
to = max_gdp,
length.out = 100)),
pop = med_pop,
continent = c("Africa", "Americas",
"Asia", "Europe", "Oceania"))
Now create a predicted value for every point in the grid.
pred_out <- predict(object = out,
newdata = pred_df,
interval = "predict")
head(pred_out)
## fit lwr upr
## 1 47.96863 31.54775 64.38951
## 2 48.48298 32.06231 64.90365
## 3 48.99733 32.57670 65.41797
## 4 49.51169 33.09092 65.93245
## 5 50.02604 33.60497 66.44711
## 6 50.54039 34.11885 66.96193
Next, combine the predicted values with the corresponding points from the grid.
pred_df <- cbind(pred_df, pred_out)
head(pred_df)
## gdpPercap pop continent fit lwr upr
## 1 241.1659 7023596 Africa 47.96863 31.54775 64.38951
## 2 1385.4282 7023596 Africa 48.48298 32.06231 64.90365
## 3 2529.6905 7023596 Africa 48.99733 32.57670 65.41797
## 4 3673.9528 7023596 Africa 49.51169 33.09092 65.93245
## 5 4818.2150 7023596 Africa 50.02604 33.60497 66.44711
## 6 5962.4773 7023596 Africa 50.54039 34.11885 66.96193
Now graph the results of predictions from the grid and add the raw data. Note that I have used filter from the tidyverse instead of subset from base R.
p <- ggplot(data = filter(pred_df, continent %in% c("Europe", "Africa")),
aes(x = gdpPercap,
y = fit, ymin = lwr, ymax = upr,
color = continent,
fill = continent,
group = continent))
p + geom_point(data = filter(gapminder,
continent %in% c("Europe", "Africa")),
aes(x = gdpPercap, y = lifeExp,
color = continent),
alpha = 0.5,
inherit.aes = FALSE) +
geom_line() +
geom_ribbon(alpha = 0.2, color = FALSE) +
scale_x_log10(labels = scales::dollar)
Here is a graphic showing all of the continents using facet_wrap. Using color to dsiplay year shows how much life expectancy has increased over time irrespective of perCapita GDP
p <- ggplot(data = pred_df,
aes(x = gdpPercap,
y = fit, ymin = lwr, ymax = upr,
group = continent))
p + geom_point(data = gapminder,
aes(x = gdpPercap, y = lifeExp,color=year),
alpha = 0.5,
inherit.aes = FALSE) +
geom_line() +
scale_x_log10(labels = scales::dollar) +
facet_wrap(~continent)