rm(list = ls())
pacman::p_load(
tidyverse
)
dta_e0 <- read_csv("hmd_explorer/data/hmd_e0.csv")
## Parsed with column specification:
## cols(
## code = col_character(),
## year = col_double(),
## gender = col_character(),
## e0 = col_double()
## )
We also want to take the estimates for Scotland for 2017 from the NRS.
data_nrs <- tribble(
~code, ~year, ~gender, ~e0,
"GBR_SCO", 2017, "female", 81.15,
"GBR_SCO", 2017, "male", 77.14
)
dta_e0 <- bind_rows(
dta_e0,
data_nrs
)
dta_e0 %>%
filter(code == "GBR_SCO") %>%
filter(gender != "total") %>%
group_by(gender) %>%
arrange(year) %>%
mutate(change_e0 = e0 - lag(e0)) %>%
filter(year == 2015) %>%
mutate(change_e0_months = change_e0 * 12)
dta_e0 %>%
filter(code == "GBR_SCO") %>%
filter(gender != "total") %>%
group_by(gender) %>%
arrange(year) %>%
mutate(change_e0 = e0 - lag(e0)) %>%
ggplot(aes(x = year, y = change_e0)) +
geom_line() + geom_point() +
facet_grid(gender ~ .) +
geom_hline(yintercept = 0) +
scale_x_continuous(breaks = seq(1860, 2010, by = 10)) +
annotate("rect", xmin = 1955, xmax = 2014, ymin = -Inf, ymax = Inf, alpha = 0.2) +
labs(
title = "Annual changes in life expectancy over time in Scotland, 1855-2017",
subtitle = "Highlighted region: 1955-2014. Highlighted point: 2015",
caption = "Source: HMD, except 2017: courtesy of NRS",
x = "Year",
y = "Change in life expectancy from previous year in years"
) +
geom_point(
aes(x = year, y = change_e0),
size = 2, shape = 15, colour = "red",
data = dta_e0 %>%
filter(code == "GBR_SCO") %>%
filter(gender != "total") %>%
group_by(gender) %>%
arrange(year) %>%
mutate(change_e0 = e0 - lag(e0)) %>%
filter(year == 2015)
)
## Warning: Removed 1 rows containing missing values (geom_path).
## Warning: Removed 2 rows containing missing values (geom_point).
dta_e0 %>%
filter(code == "GBR_SCO") %>%
filter(gender != "total") %>%
filter(year >= 1955) %>%
group_by(gender) %>%
arrange(year) %>%
mutate(change_e0 = e0 - lag(e0)) %>%
ggplot(aes(x = year, y = change_e0)) +
geom_line() + geom_point() +
facet_grid(gender ~ .) +
geom_hline(yintercept = 0) +
scale_x_continuous(breaks = seq(1860, 2010, by = 10)) +
labs(
title = "Annual changes in life expectancy over time in Scotland, 1955-2017",
subtitle = "Highlighted region: 1955-2014. Highlighted point: 2015",
caption = "Source: HMD, except 2017: courtesy of NRS",
x = "Year",
y = "Change in life expectancy from previous year in years"
) +
geom_point(
aes(x = year, y = change_e0),
size = 2, shape = 15, colour = "red",
data = dta_e0 %>%
filter(code == "GBR_SCO") %>%
filter(gender != "total") %>%
group_by(gender) %>%
arrange(year) %>%
mutate(change_e0 = e0 - lag(e0)) %>%
filter(year == 2015)
)
## Warning: Removed 1 rows containing missing values (geom_path).
## Warning: Removed 2 rows containing missing values (geom_point).
dta_e0 %>%
filter(code == "GBR_SCO") %>%
filter(gender != "total") %>%
group_by(gender) %>%
arrange(year) %>%
mutate(change_e0 = e0 - lag(e0)) %>%
filter(between(year, 1955, 2014)) %>%
ggplot(aes(x = change_e0)) +
stat_density(alpha = 0.2) +
geom_rug() +
facet_grid(gender ~ .) +
geom_vline(xintercept = 0, size = 1.5) +
geom_vline(
aes(xintercept =change_e0),
data = dta_e0 %>%
filter(code == "GBR_SCO") %>%
filter(gender != "total") %>%
group_by(gender) %>%
arrange(year) %>%
mutate(change_e0 = e0 - lag(e0)) %>%
filter(year == 2015),
colour = "red",
linetype = "dashed"
) +
geom_vline(
aes(xintercept =mean_change_e0),
data = dta_e0 %>%
filter(code == "GBR_SCO") %>%
filter(gender != "total") %>%
group_by(gender) %>%
arrange(year) %>%
mutate(change_e0 = e0 - lag(e0)) %>%
filter(between(year, 1955, 2014)) %>%
group_by(gender) %>%
summarise(mean_change_e0 = mean(change_e0)) %>%
ungroup(),
colour = "blue"
) +
labs(
title = "Range of falls and gains in annual life expectancy in Scotland, 1955-2014",
x = "Change in life expectancy from previous year in years",
y = "Density of distribution of changes"
) +
geom_text(
aes(x = mean_change_e0), y = 0.75,
label = "Average\nChange", fontface = "bold", colour = "white",
nudge_y = 0.01, nudge_x = 0.0025,
data = dta_e0 %>%
filter(code == "GBR_SCO") %>%
filter(gender != "total") %>%
group_by(gender) %>%
arrange(year) %>%
mutate(change_e0 = e0 - lag(e0)) %>%
filter(between(year, 1955, 2014)) %>%
group_by(gender) %>%
summarise(mean_change_e0 = mean(change_e0)) %>%
ungroup()
) +
geom_text(
aes(x = mean_change_e0), y = 0.75,
label = "Average\nChange", fontface = "bold",
data = dta_e0 %>%
filter(code == "GBR_SCO") %>%
filter(gender != "total") %>%
group_by(gender) %>%
arrange(year) %>%
mutate(change_e0 = e0 - lag(e0)) %>%
filter(between(year, 1955, 2014)) %>%
group_by(gender) %>%
summarise(mean_change_e0 = mean(change_e0)) %>%
ungroup()
) +
geom_text(
aes(x = change_e0), y = 0.75,
label = "Change from\n2014-2015", fontface = "bold", colour = "white",
nudge_y = 0.01, nudge_x = 0.0025,
data = dta_e0 %>%
filter(code == "GBR_SCO") %>%
filter(gender != "total") %>%
group_by(gender) %>%
arrange(year) %>%
mutate(change_e0 = e0 - lag(e0)) %>%
filter(year == 2015)
) +
geom_text(
aes(x = change_e0), y = 0.75,
label = "Change from\n2014-2015", fontface = "bold",
data = dta_e0 %>%
filter(code == "GBR_SCO") %>%
filter(gender != "total") %>%
group_by(gender) %>%
arrange(year) %>%
mutate(change_e0 = e0 - lag(e0)) %>%
filter(year == 2015)
) +
geom_text(
aes(x = change_e0), y = 0.75,
label = "Change from\n2014-2015", fontface = "bold",
data = dta_e0 %>%
filter(code == "GBR_SCO") %>%
filter(gender != "total") %>%
group_by(gender) %>%
arrange(year) %>%
mutate(change_e0 = e0 - lag(e0)) %>%
filter(year == 2015)
) +
geom_text(
aes(x = change_e0, label = year), y = 0.1,
data = dta_e0 %>%
filter(code == "GBR_SCO") %>%
filter(gender != "total") %>%
group_by(gender) %>%
arrange(year) %>%
mutate(change_e0 = e0 - lag(e0)) %>%
filter(between(year, 1955, 2014)) %>%
mutate(rank_ch = rank(change_e0)) %>%
filter(rank_ch <= 2 | rank_ch >= 59)
)
The table below shows the mean and standard deviation of the annual changes that occurred between 1955-2014. The average (mean) annual improvement was 0.18 years per year for females and 0.20 years per year for males, with a standard deviation of 0.31 for females and 0.27 for males. These values are used to estimate the probability, and so expected frequency, of the 2014-15 decline, using a normal distribution. This approach suggestes there is a 6.0% change of an event as or more severe than the 2014-15 decline for females, and a 1.7% probability of observing an event as or more severe than the 2014-15 decline for males. This suggests the 2014-15 fall was a 1-in-17 year event for females, and a 1-in-60 year event for males.
summary_stats <- dta_e0 %>%
filter(code == "GBR_SCO") %>%
filter(gender != "total") %>%
group_by(gender) %>%
arrange(year) %>%
mutate(change_e0 = e0 - lag(e0)) %>%
filter(between(year, 1955, 2014)) %>%
group_by(gender) %>%
summarise(
mean_ch_e0 = mean(change_e0),
sd_ch_e0 = sd(change_e0)
) %>%
ungroup()
summary_stats
summary_stats %>%
left_join(
dta_e0 %>%
filter(code == "GBR_SCO") %>%
filter(gender != "total") %>%
group_by(gender) %>%
arrange(year) %>%
mutate(change_e0 = e0 - lag(e0)) %>%
filter(year == 2015) %>%
select(gender, change_in_2015 = change_e0)
) %>%
mutate(
prob_2015_chance = pmap_dbl(
list(
q = change_in_2015,
mean = mean_ch_e0,
sd = sd_ch_e0
),
.f = pnorm
),
one_in = 1/ prob_2015_chance,
prob_no_improvovement = pmap_dbl(
list(
q = c(0,0),
mean = mean_ch_e0,
sd = sd_ch_e0
),
.f = pnorm
)
)
## Joining, by = "gender"
The model above assumes that the change in life expectancy in any year is not dependent on that in the previous year. Put another way, the model is ‘memoryless’, and has not predictor terms (i.e. ‘right hand’ terms in regression formula). The next simplest model approach would be to build a model that includes the previous year’s change as a predictor variable. So, a one year lag term.
The prior hypothesis - consistent with ‘harvesting’ - is that the change in one year will be negatively correlated with the previous year.
This can be done by building a linear regression model with the previous year as a predictor variable
mdls <- dta_e0 %>%
filter(code == "GBR_SCO") %>%
filter(gender != "total") %>%
group_by(gender) %>%
arrange(year) %>%
mutate(change_e0 = e0 - lag(e0)) %>%
mutate(last_change_e0 = lag(change_e0)) %>%
filter(between(year, 1955, 2014)) %>%
group_by(gender) %>%
nest() %>%
mutate(mdl_nolag = map(data, ~lm(change_e0 ~ 1, data = .))) %>%
mutate(mdl_lag = map(data, ~lm(change_e0 ~ last_change_e0, data = .))) %>%
mutate(anova = map2(mdl_nolag, mdl_lag, anova)) %>%
mutate(p_autocorrelation = map_dbl(anova, ~.[2,6])) %>%
mutate(model_lag_tidy = map(mdl_lag, broom::tidy))
lag_mdl_summary <- mdls %>%
select(gender, model_lag_tidy) %>%
unnest()
p_autocorr_female <- mdls %>% filter(gender == "female") %>% pull("p_autocorrelation")
p_autocorr_male <- mdls %>% filter(gender == "male") %>% pull("p_autocorrelation")
Two model specifications have been built.
mdl_nolag is effectively the model specification used above, which has no predictor terms.mdl_lag includes the previous year’s change as a predictor variable.As mdl_nolag is the same as mdl_lag with one of its terms set to zero, mdl_nolag is a restricted version of mdl_lag, and so ANOVA can be used to test whether the move from the unrestricted model (mdl_nolag) to the restricted (mdl_lag) is justified in terms of improved fit to the data. In effect, the ANOVA test gives us an estimated probability that the data contains autocorrelation. These probabilities are 1.410^{-4} for females, and 0.02706 for males, and so provide strong evidence that autocorrelation exists in the data.
The specific type of autocorrelation of interest is negative autocorrelation. Negative autocorrelation would be present if the estimate column for the term last_change_e0 were negative. The coefficient of this term is statistically significant at p < 0.05 if the corresponding row in the column p.value is below 0.05. We can see from the table below that both of these conditions are true, though the magnitude of the negative autocorrelation is larger for females than males, and the associated p value for females much smaller than for males.
lag_mdl_summary
One conclusion we might draw from this evidence that the data contains negative autocorrelation is that the most troubling aspect of changes in life expectancy in Scotland since 2014 is not necessarily the fall in life expectancy from 2014-15, but the lack of any notable recovery in life expectancy increases from 2015-16. Negative autocorrelation would imply that a year showing a substantial fall in life expectancy is more likely to be followed by a substantial improvement in life expectancy, above the long-term average improvement, the following year. Instead, life expectancy improvement in 2015-16 was close to zero.
This supplementary analysis will explore the effect of using all available years for Scotland, rather than 1955-2014. This should be useful for avoiding any accusation of ‘cherry picking’ the selection of years.
dta_e0 %>%
filter(code == "GBR_SCO") %>%
filter(gender != "total") %>%
group_by(gender) %>%
arrange(year) %>%
mutate(change_e0 = e0 - lag(e0)) %>%
filter(between(year, 1855, 2014)) %>%
ggplot(aes(x = change_e0)) +
stat_density(alpha = 0.2) +
geom_rug() +
facet_grid(gender ~ .) +
geom_vline(xintercept = 0, size = 1.5) +
geom_vline(
aes(xintercept =change_e0),
data = dta_e0 %>%
filter(code == "GBR_SCO") %>%
filter(gender != "total") %>%
group_by(gender) %>%
arrange(year) %>%
mutate(change_e0 = e0 - lag(e0)) %>%
filter(year == 2015),
colour = "red",
linetype = "dashed"
) +
geom_vline(
aes(xintercept =mean_change_e0),
data = dta_e0 %>%
filter(code == "GBR_SCO") %>%
filter(gender != "total") %>%
group_by(gender) %>%
arrange(year) %>%
mutate(change_e0 = e0 - lag(e0)) %>%
filter(between(year, 1855, 2014)) %>%
group_by(gender) %>%
summarise(mean_change_e0 = mean(change_e0, na.rm = T)) %>%
ungroup(),
colour = "blue"
) +
labs(
title = "Range of falls and gains in annual life expectancy in Scotland, 1855-2014",
x = "Change in life expectancy from previous year in years",
y = "Density of distribution of changes"
) +
geom_text(
aes(x = mean_change_e0), y = 0.15,
label = "Average\nChange", fontface = "bold", colour = "white",
nudge_y = 0.01, nudge_x = 0.0025,
data = dta_e0 %>%
filter(code == "GBR_SCO") %>%
filter(gender != "total") %>%
group_by(gender) %>%
arrange(year) %>%
mutate(change_e0 = e0 - lag(e0)) %>%
filter(between(year, 1855, 2014)) %>%
group_by(gender) %>%
summarise(mean_change_e0 = mean(change_e0, na.rm = T)) %>%
ungroup()
) +
geom_text(
aes(x = mean_change_e0), y = 0.15,
label = "Average\nChange", fontface = "bold",
data = dta_e0 %>%
filter(code == "GBR_SCO") %>%
filter(gender != "total") %>%
group_by(gender) %>%
arrange(year) %>%
mutate(change_e0 = e0 - lag(e0)) %>%
filter(between(year, 1855, 2014)) %>%
group_by(gender) %>%
summarise(mean_change_e0 = mean(change_e0, na.rm = T)) %>%
ungroup()
) +
geom_text(
aes(x = change_e0), y = 0.45,
label = "Change from\n2014-2015", fontface = "bold", colour = "white",
nudge_y = 0.01, nudge_x = 0.0025,
data = dta_e0 %>%
filter(code == "GBR_SCO") %>%
filter(gender != "total") %>%
group_by(gender) %>%
arrange(year) %>%
mutate(change_e0 = e0 - lag(e0)) %>%
filter(year == 2015)
) +
geom_text(
aes(x = change_e0), y = 0.45,
label = "Change from\n2014-2015", fontface = "bold",
data = dta_e0 %>%
filter(code == "GBR_SCO") %>%
filter(gender != "total") %>%
group_by(gender) %>%
arrange(year) %>%
mutate(change_e0 = e0 - lag(e0)) %>%
filter(year == 2015)
) +
geom_text(
aes(x = change_e0), y = 0.45,
label = "Change from\n2014-2015", fontface = "bold",
data = dta_e0 %>%
filter(code == "GBR_SCO") %>%
filter(gender != "total") %>%
group_by(gender) %>%
arrange(year) %>%
mutate(change_e0 = e0 - lag(e0)) %>%
filter(year == 2015)
) +
geom_text(
aes(x = change_e0, label = year), y = 0.1,
data = dta_e0 %>%
filter(code == "GBR_SCO") %>%
filter(gender != "total") %>%
group_by(gender) %>%
arrange(year) %>%
mutate(change_e0 = e0 - lag(e0)) %>%
filter(between(year, 1855, 2014)) %>%
mutate(rank_ch = rank(change_e0)) %>%
filter(rank_ch <= 2 | rank_ch >= 158)
)
## Warning: Removed 2 rows containing non-finite values (stat_density).
## Warning: Removed 2 rows containing missing values (geom_text).
It’s clear that there used to be much more variability in annual changes before the Second World War than after, and that a Normal distribution is much less appropriate. However if summarising the data in this way the mean and sd are as follows:
summary_stats_full <- dta_e0 %>%
filter(code == "GBR_SCO") %>%
filter(gender != "total") %>%
group_by(gender) %>%
arrange(year) %>%
mutate(change_e0 = e0 - lag(e0)) %>%
filter(between(year, 1855, 2014)) %>%
group_by(gender) %>%
summarise(
mean_ch_e0 = mean(change_e0, na.rm = T),
sd_ch_e0 = sd(change_e0, na.rm = T)
) %>%
ungroup()
summary_stats_full
The mean increases were a bit higher, but the SDs were much, much higher.
If making the assumption that the kind of extreme variability in annual changes observed in the 19th century and first half of the 20th century should still be considered relevant, then the probability of seeing a fall as or more severe than seen in 2014-15 falls to one-in-3. (See below)
summary_stats_full %>%
left_join(
dta_e0 %>%
filter(code == "GBR_SCO") %>%
filter(gender != "total") %>%
group_by(gender) %>%
arrange(year) %>%
mutate(change_e0 = e0 - lag(e0)) %>%
filter(year == 2015) %>%
select(gender, change_in_2015 = change_e0)
) %>%
mutate(
prob_2015_chance = pmap_dbl(
list(
q = change_in_2015,
mean = mean_ch_e0,
sd = sd_ch_e0
),
.f = pnorm
),
one_in = 1/ prob_2015_chance,
prob_no_improvovement = pmap_dbl(
list(
q = c(0,0),
mean = mean_ch_e0,
sd = sd_ch_e0
),
.f = pnorm
)
)
## Joining, by = "gender"
dta_e0 %>%
filter(code == "GBRCENW") %>%
filter(gender != "total") %>%
group_by(gender) %>%
arrange(year) %>%
mutate(change_e0 = e0 - lag(e0)) %>%
ggplot(aes(x = year, y = change_e0)) +
geom_line() + geom_point() +
facet_grid(gender ~ .) +
geom_hline(yintercept = 0) +
scale_x_continuous(breaks = seq(1840, 2010, by = 10)) +
annotate("rect", xmin = 1955, xmax = 2014, ymin = -Inf, ymax = Inf, alpha = 0.2) +
labs(
title = "Annual changes in life expectancy over time in England & Wales, 1841-2016",
subtitle = "Highlighted region: 1955-2014. Highlighted point: 2015",
caption = "Source: HMD",
x = "Year",
y = "Change in life expectancy from previous year in years"
) +
geom_point(
aes(x = year, y = change_e0),
size = 2, shape = 15, colour = "red",
data = dta_e0 %>%
filter(code == "GBRCENW") %>%
filter(gender != "total") %>%
group_by(gender) %>%
arrange(year) %>%
mutate(change_e0 = e0 - lag(e0)) %>%
filter(year == 2015)
)
## Warning: Removed 1 rows containing missing values (geom_path).
## Warning: Removed 2 rows containing missing values (geom_point).
The range of variability is much higher, due to much greater exposure to the 1918 influenza pandemic/end of First World War. Let’s now look again just at the range starting 1955
dta_e0 %>%
filter(code == "GBRCENW") %>%
filter(year >= 1955) %>%
filter(gender != "total") %>%
group_by(gender) %>%
arrange(year) %>%
mutate(change_e0 = e0 - lag(e0)) %>%
ggplot(aes(x = year, y = change_e0)) +
geom_line() + geom_point() +
facet_grid(gender ~ .) +
geom_hline(yintercept = 0) +
scale_x_continuous(breaks = seq(1860, 2010, by = 10)) +
labs(
title = "Annual changes in life expectancy over time in England & Wales, 1955-2016",
subtitle = "Highlighted point: 2015",
caption = "Source: HMD",
x = "Year",
y = "Change in life expectancy from previous year in years"
) +
geom_point(
aes(x = year, y = change_e0),
size = 2, shape = 15, colour = "red",
data = dta_e0 %>%
filter(code == "GBRCENW") %>%
filter(gender != "total") %>%
group_by(gender) %>%
arrange(year) %>%
mutate(change_e0 = e0 - lag(e0)) %>%
filter(year == 2015)
)
## Warning: Removed 1 rows containing missing values (geom_path).
## Warning: Removed 2 rows containing missing values (geom_point).
Now to show the distribution in comparison to 2014-15
dta_e0 %>%
filter(code == "GBRCENW") %>%
filter(gender != "total") %>%
group_by(gender) %>%
arrange(year) %>%
mutate(change_e0 = e0 - lag(e0)) %>%
filter(between(year, 1955, 2014)) %>%
ggplot(aes(x = change_e0)) +
stat_density(alpha = 0.2) +
geom_rug() +
facet_grid(gender ~ .) +
geom_vline(xintercept = 0, size = 1.5) +
geom_vline(
aes(xintercept =change_e0),
data = dta_e0 %>%
filter(code == "GBRCENW") %>%
filter(gender != "total") %>%
group_by(gender) %>%
arrange(year) %>%
mutate(change_e0 = e0 - lag(e0)) %>%
filter(year == 2015),
colour = "red",
linetype = "dashed"
) +
geom_vline(
aes(xintercept =mean_change_e0),
data = dta_e0 %>%
filter(code == "GBRCENW") %>%
filter(gender != "total") %>%
group_by(gender) %>%
arrange(year) %>%
mutate(change_e0 = e0 - lag(e0)) %>%
filter(between(year, 1955, 2014)) %>%
group_by(gender) %>%
summarise(mean_change_e0 = mean(change_e0)) %>%
ungroup(),
colour = "blue"
) +
labs(
title = "Range of falls and gains in annual life expectancy in England & Wales, 1955-2014",
x = "Change in life expectancy from previous year in years",
y = "Density of distribution of changes"
) +
geom_text(
aes(x = mean_change_e0), y = 0.75,
label = "Average\nChange", fontface = "bold", colour = "white",
nudge_y = 0.01, nudge_x = 0.0025,
data = dta_e0 %>%
filter(code == "GBRCENW") %>%
filter(gender != "total") %>%
group_by(gender) %>%
arrange(year) %>%
mutate(change_e0 = e0 - lag(e0)) %>%
filter(between(year, 1955, 2014)) %>%
group_by(gender) %>%
summarise(mean_change_e0 = mean(change_e0)) %>%
ungroup()
) +
geom_text(
aes(x = mean_change_e0), y = 0.75,
label = "Average\nChange", fontface = "bold",
data = dta_e0 %>%
filter(code == "GBRCENW") %>%
filter(gender != "total") %>%
group_by(gender) %>%
arrange(year) %>%
mutate(change_e0 = e0 - lag(e0)) %>%
filter(between(year, 1955, 2014)) %>%
group_by(gender) %>%
summarise(mean_change_e0 = mean(change_e0)) %>%
ungroup()
) +
geom_text(
aes(x = change_e0), y = 0.75,
label = "Change from\n2014-2015", fontface = "bold", colour = "white",
nudge_y = 0.01, nudge_x = 0.0025,
data = dta_e0 %>%
filter(code == "GBRCENW") %>%
filter(gender != "total") %>%
group_by(gender) %>%
arrange(year) %>%
mutate(change_e0 = e0 - lag(e0)) %>%
filter(year == 2015)
) +
geom_text(
aes(x = change_e0), y = 0.75,
label = "Change from\n2014-2015", fontface = "bold",
data = dta_e0 %>%
filter(code == "GBRCENW") %>%
filter(gender != "total") %>%
group_by(gender) %>%
arrange(year) %>%
mutate(change_e0 = e0 - lag(e0)) %>%
filter(year == 2015)
) +
geom_text(
aes(x = change_e0), y = 0.75,
label = "Change from\n2014-2015", fontface = "bold",
data = dta_e0 %>%
filter(code == "GBRCENW") %>%
filter(gender != "total") %>%
group_by(gender) %>%
arrange(year) %>%
mutate(change_e0 = e0 - lag(e0)) %>%
filter(year == 2015)
) +
geom_text(
aes(x = change_e0, label = year), y = 0.1,
data = dta_e0 %>%
filter(code == "GBRCENW") %>%
filter(gender != "total") %>%
group_by(gender) %>%
arrange(year) %>%
mutate(change_e0 = e0 - lag(e0)) %>%
filter(between(year, 1955, 2014)) %>%
mutate(rank_ch = rank(change_e0)) %>%
filter(rank_ch <= 2 | rank_ch >= 59)
)
The 2014-15 fall looks to be a comparably rarer event for England & Wales than for Scotland. Let’s quantify this
summary_stats <- dta_e0 %>%
filter(code == "GBRCENW") %>%
filter(gender != "total") %>%
group_by(gender) %>%
arrange(year) %>%
mutate(change_e0 = e0 - lag(e0)) %>%
filter(between(year, 1955, 2014)) %>%
group_by(gender) %>%
summarise(
mean_ch_e0 = mean(change_e0),
sd_ch_e0 = sd(change_e0)
) %>%
ungroup()
summary_stats
And now the estimated probability/frequency
summary_stats %>%
left_join(
dta_e0 %>%
filter(code == "GBRCENW") %>%
filter(gender != "total") %>%
group_by(gender) %>%
arrange(year) %>%
mutate(change_e0 = e0 - lag(e0)) %>%
filter(year == 2015) %>%
select(gender, change_in_2015 = change_e0)
) %>%
mutate(
prob_2015_chance = pmap_dbl(
list(
q = change_in_2015,
mean = mean_ch_e0,
sd = sd_ch_e0
),
.f = pnorm
),
one_in = 1/ prob_2015_chance,
prob_no_improvovement = pmap_dbl(
list(
q = c(0,0),
mean = mean_ch_e0,
sd = sd_ch_e0
),
.f = pnorm
)
)
## Joining, by = "gender"
The initial visual impression, based largely on just counting the ticks to the left of the red dashed line, was a bit misleading.
The 2014-15 fall was a 1-in-27 year event (cf 1-in-16 for Scotland) for males, and a 1-in-42 year event (cf 1-in-60) for females.
This section will visually look at autocorrelation of 1 and 2 lags in Scotland, and England & Wales
dta_e0 %>%
filter(code == "GBR_SCO") %>%
filter(gender != "total") %>%
group_by(gender) %>%
arrange(year) %>%
mutate(change_e0 = e0 - lag(e0)) %>%
mutate(lag_1 = lag(change_e0)) %>%
mutate(lag_2 = lag(change_e0, 2)) %>%
filter(between(year, 1955, 2014)) %>%
gather(key = "lag", value = "value", lag_1, lag_2) %>%
group_by(gender) %>%
ggplot(aes(x = change_e0, y = value)) +
geom_point() +
geom_smooth(method = "lm") +
facet_grid(lag ~ gender)
As might be expected, the autocorrelation with the previous year (lag_1) is stronger than the autocorrelation with two years previously (lag_2).
Let’s quantify this
dta_e0 %>%
filter(code == "GBR_SCO") %>%
filter(gender != "total") %>%
group_by(gender) %>%
arrange(year) %>%
mutate(change_e0 = e0 - lag(e0)) %>%
mutate(lag_1 = lag(change_e0)) %>%
mutate(lag_2 = lag(change_e0, 2)) %>%
filter(between(year, 1955, 2014)) %>%
gather(key = "lag", value = "value", lag_1, lag_2) %>%
group_by(gender, lag) %>%
nest() %>%
mutate(corr = map_dbl(data, ~ cor(.x[,c("change_e0", "value")]) %>% .[2,1])) %>%
select(-data) %>%
spread(lag, corr)
So, the change over a year is autocorrelated with the previous year’s change, and more weakly for two years’ prior.
The autocorrelation to the previous year’s change is -0.47 for females and -0.29 for males.
And now for England & Wales
dta_e0 %>%
filter(code == "GBRCENW") %>%
filter(gender != "total") %>%
group_by(gender) %>%
arrange(year) %>%
mutate(change_e0 = e0 - lag(e0)) %>%
mutate(lag_1 = lag(change_e0)) %>%
mutate(lag_2 = lag(change_e0, 2)) %>%
filter(between(year, 1955, 2014)) %>%
gather(key = "lag", value = "value", lag_1, lag_2) %>%
group_by(gender) %>%
ggplot(aes(x = change_e0, y = value)) +
geom_point() +
geom_smooth(method = "lm") +
facet_grid(lag ~ gender)
Lag-2 autocorrelation looks even weaker for England & Wales than for Scotland. Again, let’s quantify this
dta_e0 %>%
filter(code == "GBRCENW") %>%
filter(gender != "total") %>%
group_by(gender) %>%
arrange(year) %>%
mutate(change_e0 = e0 - lag(e0)) %>%
mutate(lag_1 = lag(change_e0)) %>%
mutate(lag_2 = lag(change_e0, 2)) %>%
filter(between(year, 1955, 2014)) %>%
gather(key = "lag", value = "value", lag_1, lag_2) %>%
group_by(gender, lag) %>%
nest() %>%
mutate(corr = map_dbl(data, ~ cor(.x[,c("change_e0", "value")]) %>% .[2,1])) %>%
select(-data) %>%
spread(lag, corr)
Pretty much no lag-2 autocorrelation.
But a very similar magnitude of lag-1 autocorrelation by sex as for Scotland.
The aim of this section is to produce a dataset that can be visualised in Excel. This dataset will show the change in e0 since 1955 for each country in the HMD.
Each column will be a different gender/country combination
tbl <- dta_e0 %>%
filter(gender != "total") %>%
group_by(code, gender) %>%
arrange(year) %>%
mutate(change_e0 = e0 - lag(e0)) %>%
filter(year >= 1955) %>%
unite(sex_country, code, gender) %>%
select(sex_country, year, change_e0) %>%
spread(sex_country, change_e0, fill = NA_real_)
write_csv(tbl, path = "support/changes_since_1955.csv")
And now, to visualise for all countries
g <- dta_e0 %>%
filter(!(code %in% c("NZL_NM" ,"NZL_MA"))) %>%
filter(gender != "total") %>%
group_by(code, gender) %>%
arrange(year) %>%
mutate(change_e0 = e0 - lag(e0)) %>%
filter(year >= 1955) %>%
ggplot(aes(x = year, y = change_e0)) +
facet_grid(code~gender) +
geom_point() + geom_line() +
geom_hline(yintercept = 0) +
labs(
title = "Annual changes in life expectancy over time, 1955-2016, all countries",
subtitle = "Highlighted point: 2015",
caption = "Source: HMD",
x = "Year",
y = "Change in life expectancy from previous year in years"
) +
geom_point(
aes(x = year, y = change_e0),
size = 2, shape = 15, colour = "red",
data = dta_e0 %>%
filter(!(code %in% c("NZL_NM" ,"NZL_MA"))) %>%
filter(gender != "total") %>%
group_by(code, gender) %>%
arrange(year) %>%
mutate(change_e0 = e0 - lag(e0)) %>%
filter(year == 2015)
)
print(g)
## Warning: Removed 36 rows containing missing values (geom_point).
ggsave("support/all_countries_change_since_1955.png", height = 200, width = 25, units = "cm", dpi = 300, limitsize = FALSE)
## Warning: Removed 36 rows containing missing values (geom_point).
The aim of this section is to generalise the analyses shown earlier for Scotland, and England & Wales, to all HMD countries
dta_e0 %>%
filter(!(code %in% c("NZL_NM" ,"NZL_MA"))) %>%
filter(gender != "total") %>%
group_by(code, gender) %>%
arrange(year) %>%
mutate(change_e0 = e0 - lag(e0)) %>%
mutate(lag_1 = lag(change_e0)) %>%
mutate(lag_2 = lag(change_e0, 2)) %>%
filter(between(year, 1955, 2014)) %>%
group_by(code, gender) %>%
nest() %>%
mutate(
m0 = map(data, ~lm(change_e0 ~ 1, data = .)),
m1 = map(data, ~lm(change_e0 ~ lag_1, data = .)),
m2 = map(data, ~lm(change_e0 ~ lag_1 + lag_2, data = .))
) %>%
select(-data) %>%
gather(key = "lag", value = "model", m0:m2) %>%
mutate(
aic = map_dbl(model, AIC),
bic = map_dbl(model, BIC)
) %>%
mutate(lag = substr(lag, 2,2) %>% as.integer) %>%
ggplot(aes(x = lag, y = bic, group = gender, colour = gender, shape = gender)) +
facet_wrap(~code, scales = "free_y") +
geom_point() + geom_line()
This indicates that a simple ‘oscillatory’ pattern of annual changes - a bad year predicting a good year and vice versa - is far from a universal pattern, and often differs significantly between genders even in the same country. Often, but not always, more terms lead to a better penalised model fit (using BIC, which tends to be more conservative than AIC).
Let’s now explore simply the magnitude and direction of lag 1 negative autocorrelation in the data
dta_e0 %>%
filter(!(code %in% c("NZL_NM" ,"NZL_MA"))) %>%
filter(gender != "total") %>%
group_by(code, gender) %>%
arrange(year) %>%
mutate(change_e0 = e0 - lag(e0)) %>%
mutate(lag_1 = lag(change_e0)) %>%
mutate(lag_2 = lag(change_e0, 2)) %>%
filter(between(year, 1955, 2014)) %>%
gather(key = "lag", value = "value", lag_1, lag_2) %>%
group_by(code, gender, lag) %>%
nest() %>%
mutate(corr = map_dbl(data, ~ cor(.x[,c("change_e0", "value")]) %>% .[2,1])) %>%
select(-data) %>%
spread(lag, corr) %>%
filter(!is.na(lag_1)) %>%
ggplot(aes(x = lag_1, y = reorder(code, lag_1), shape = gender, colour = gender)) +
geom_point() +
geom_vline(xintercept = 0) +
labs(
x = "Correlation with previous year",
y = "Country",
title = "Autocorrelation between annual changes in life expectancy and change in previous year",
subtitle = "Period: 1955-2014 or nearest available range of years",
caption = "Source: HMD"
)
This is important for establishing a general principle: Though a 1 lag model does not universally outperform 0 or 2 lag models, it is generally the case that a negative autocorrelation exists between changes in one year and changes in the following year. i.e. it is generally correct to assume that life expectancy changes ‘oscillate’ rather than are a pure random walk.