rm(list = ls())

pacman::p_load(
  tidyverse
)

dta_e0 <- read_csv("hmd_explorer/data/hmd_e0.csv")
## Parsed with column specification:
## cols(
##   code = col_character(),
##   year = col_double(),
##   gender = col_character(),
##   e0 = col_double()
## )

We also want to take the estimates for Scotland for 2017 from the NRS.

data_nrs <- tribble(
  ~code, ~year, ~gender, ~e0,
  "GBR_SCO", 2017, "female", 81.15,
  "GBR_SCO", 2017, "male", 77.14
)

dta_e0 <- bind_rows(
  dta_e0,
  data_nrs
)

Introduction

dta_e0 %>% 
  filter(code == "GBR_SCO") %>% 
  filter(gender != "total") %>% 
  group_by(gender) %>% 
  arrange(year) %>% 
  mutate(change_e0 = e0 - lag(e0)) %>% 
  filter(year == 2015) %>% 
  mutate(change_e0_months = change_e0 * 12)

Methods

Results

Figure 1: Long term trend of change

  • The figure below shows annual change in life expectancy in Scotland over time by sex. The point marking the 2014-15 fall is highlighted with a red point. Throughout the latter half of 19th century, and first half of the 20th century, there were much greater swings in annual changes in life expectancy than in the latter half of the 20th century and start of 21st century. Because high annual variability has not been observed for many decades, only data from 1955-2014 (highlighted grey) are used in subsequent analyses. [Though results with all data could be included in an appendix?] We can see from this figure that, though the 2014-15 fall in life expectancy is rare, it is not unprecedented in the 1955-2014 period.
  • Some summary numbers?
dta_e0 %>% 
  filter(code == "GBR_SCO") %>% 
  filter(gender != "total") %>% 
  group_by(gender) %>% 
  arrange(year) %>% 
  mutate(change_e0 = e0 - lag(e0)) %>% 
  ggplot(aes(x = year, y = change_e0)) + 
  geom_line() + geom_point() +
  facet_grid(gender ~ .) + 
  geom_hline(yintercept = 0) + 
  scale_x_continuous(breaks = seq(1860, 2010, by = 10)) + 
  annotate("rect", xmin = 1955, xmax = 2014, ymin = -Inf, ymax = Inf, alpha = 0.2) + 
  labs(
    title = "Annual changes in life expectancy over time in Scotland, 1855-2017",
    subtitle = "Highlighted region: 1955-2014. Highlighted point: 2015",
    caption = "Source: HMD, except 2017: courtesy of NRS",
    x = "Year", 
    y = "Change in life expectancy from previous year in years"
  ) + 
  geom_point(
    aes(x = year, y = change_e0),
    size = 2, shape = 15, colour = "red",
    data = dta_e0 %>% 
      filter(code == "GBR_SCO") %>% 
      filter(gender != "total") %>% 
      group_by(gender) %>% 
      arrange(year) %>% 
      mutate(change_e0 = e0 - lag(e0)) %>% 
      filter(year == 2015)
  )
## Warning: Removed 1 rows containing missing values (geom_path).
## Warning: Removed 2 rows containing missing values (geom_point).

Figure 1b: Change from 1955

dta_e0 %>% 
  filter(code == "GBR_SCO") %>% 
  filter(gender != "total") %>% 
  filter(year >= 1955) %>% 
  group_by(gender) %>% 
  arrange(year) %>% 
  mutate(change_e0 = e0 - lag(e0)) %>% 
  ggplot(aes(x = year, y = change_e0)) + 
  geom_line() + geom_point() +
  facet_grid(gender ~ .) + 
  geom_hline(yintercept = 0) + 
  scale_x_continuous(breaks = seq(1860, 2010, by = 10)) + 
  labs(
    title = "Annual changes in life expectancy over time in Scotland, 1955-2017",
    subtitle = "Highlighted region: 1955-2014. Highlighted point: 2015",
    caption = "Source: HMD, except 2017: courtesy of NRS",
    x = "Year", 
    y = "Change in life expectancy from previous year in years"
  ) + 
  geom_point(
    aes(x = year, y = change_e0),
    size = 2, shape = 15, colour = "red",
    data = dta_e0 %>% 
      filter(code == "GBR_SCO") %>% 
      filter(gender != "total") %>% 
      group_by(gender) %>% 
      arrange(year) %>% 
      mutate(change_e0 = e0 - lag(e0)) %>% 
      filter(year == 2015)
  )
## Warning: Removed 1 rows containing missing values (geom_path).
## Warning: Removed 2 rows containing missing values (geom_point).

Figure 2: Distribution

  • Figure 2 below shows the distribution of annual changes in life expectancy over the 1955-2014 period. Each observation is represented as a vertical ‘tick’, and the years corresponding to the two largest annual increases and declines are labelled. The black vertical line indicates no change from the previous year, the thinner blue vertical line shows the average annual change over the period, and the dashed red line indicates the change observed in 2014-15. The grey shape above the points shows the density of the distribution.
  • We can see from this that for males, only one year, 1967-68, has seen a sharper annual fall in life expectancy than the change in 2014-15. We can also see that 1967-68 was preceeded by an especially high rate of improvement the previous year, 1966-67. Over the 60 year period, only one year saw a faster fall in life expectancy for males, whereas for females four years saw greater annual declines than occurred in 2014-15.
dta_e0 %>% 
  filter(code == "GBR_SCO") %>% 
  filter(gender != "total") %>% 
  group_by(gender) %>% 
  arrange(year) %>% 
  mutate(change_e0 = e0 - lag(e0)) %>% 
  filter(between(year, 1955, 2014)) %>% 
  ggplot(aes(x = change_e0)) + 
  stat_density(alpha = 0.2) +
  geom_rug() + 
  facet_grid(gender ~ .) +
  geom_vline(xintercept = 0, size = 1.5) +
  geom_vline(
    aes(xintercept =change_e0),
    data = dta_e0 %>% 
      filter(code == "GBR_SCO") %>% 
      filter(gender != "total") %>% 
      group_by(gender) %>% 
      arrange(year) %>% 
      mutate(change_e0 = e0 - lag(e0)) %>% 
      filter(year == 2015),
    colour = "red",
    linetype = "dashed"
  ) +
  geom_vline(
    aes(xintercept =mean_change_e0),
    data = dta_e0 %>% 
      filter(code == "GBR_SCO") %>% 
      filter(gender != "total") %>% 
      group_by(gender) %>% 
      arrange(year) %>% 
      mutate(change_e0 = e0 - lag(e0)) %>% 
      filter(between(year, 1955, 2014)) %>% 
      group_by(gender) %>% 
      summarise(mean_change_e0 = mean(change_e0)) %>% 
      ungroup(),
    colour = "blue"
  ) + 
  labs(
    title = "Range of falls and gains in annual life expectancy in Scotland, 1955-2014",
    x = "Change in life expectancy from previous year in years",
    y = "Density of distribution of changes"
  ) +
  geom_text(
    aes(x = mean_change_e0), y = 0.75, 
    label = "Average\nChange", fontface = "bold", colour = "white",
    nudge_y = 0.01, nudge_x = 0.0025,
    data = dta_e0 %>% 
          filter(code == "GBR_SCO") %>% 
          filter(gender != "total") %>% 
          group_by(gender) %>% 
          arrange(year) %>% 
          mutate(change_e0 = e0 - lag(e0)) %>% 
          filter(between(year, 1955, 2014)) %>% 
          group_by(gender) %>% 
          summarise(mean_change_e0 = mean(change_e0)) %>% 
          ungroup()
  ) +
  geom_text(
    aes(x = mean_change_e0), y = 0.75, 
    label = "Average\nChange", fontface = "bold",
    data = dta_e0 %>% 
          filter(code == "GBR_SCO") %>% 
          filter(gender != "total") %>% 
          group_by(gender) %>% 
          arrange(year) %>% 
          mutate(change_e0 = e0 - lag(e0)) %>% 
          filter(between(year, 1955, 2014)) %>% 
          group_by(gender) %>% 
          summarise(mean_change_e0 = mean(change_e0)) %>% 
          ungroup()
            ) +
  geom_text(
    aes(x = change_e0), y = 0.75, 
    label = "Change from\n2014-2015", fontface = "bold", colour = "white",
    nudge_y = 0.01, nudge_x = 0.0025,
    data = dta_e0 %>% 
      filter(code == "GBR_SCO") %>% 
      filter(gender != "total") %>% 
      group_by(gender) %>% 
      arrange(year) %>% 
      mutate(change_e0 = e0 - lag(e0)) %>% 
      filter(year == 2015)
  ) +
  geom_text(
    aes(x = change_e0), y = 0.75, 
    label = "Change from\n2014-2015", fontface = "bold",
    data = dta_e0 %>% 
      filter(code == "GBR_SCO") %>% 
      filter(gender != "total") %>% 
      group_by(gender) %>% 
      arrange(year) %>% 
      mutate(change_e0 = e0 - lag(e0)) %>% 
      filter(year == 2015)
  ) +
  geom_text(
    aes(x = change_e0), y = 0.75, 
    label = "Change from\n2014-2015", fontface = "bold",
    data = dta_e0 %>% 
      filter(code == "GBR_SCO") %>% 
      filter(gender != "total") %>% 
      group_by(gender) %>% 
      arrange(year) %>% 
      mutate(change_e0 = e0 - lag(e0)) %>% 
      filter(year == 2015)
  ) +
  geom_text(
    aes(x = change_e0, label = year), y = 0.1,
    data = dta_e0 %>% 
      filter(code == "GBR_SCO") %>% 
      filter(gender != "total") %>% 
      group_by(gender) %>% 
      arrange(year) %>% 
      mutate(change_e0 = e0 - lag(e0)) %>% 
      filter(between(year, 1955, 2014)) %>% 
      mutate(rank_ch = rank(change_e0)) %>% 
      filter(rank_ch <= 2 | rank_ch >= 59)
  )

Modelling

How rare was the 2014-15 change? The simple approach

  • As there was one event as or more severe than the 2014-15 fall for males, and four events as more severe than the 2014-15 fall for females, and there were 60 observations for each sex, the simple estimates for the 2014-15 fall are 1/60 (around 1.7%) for males and 4/60 (around 6.7%) for females.
  • If needed, credible intervals for the above could be produced through bootstrapping. (i.e. repeatedly resampling from the same data).

How rare was the 2014-15 change? A slightly more complicated approach

The table below shows the mean and standard deviation of the annual changes that occurred between 1955-2014. The average (mean) annual improvement was 0.18 years per year for females and 0.20 years per year for males, with a standard deviation of 0.31 for females and 0.27 for males. These values are used to estimate the probability, and so expected frequency, of the 2014-15 decline, using a normal distribution. This approach suggestes there is a 6.0% change of an event as or more severe than the 2014-15 decline for females, and a 1.7% probability of observing an event as or more severe than the 2014-15 decline for males. This suggests the 2014-15 fall was a 1-in-17 year event for females, and a 1-in-60 year event for males.

summary_stats <- dta_e0 %>% 
  filter(code == "GBR_SCO") %>% 
  filter(gender != "total") %>% 
  group_by(gender) %>% 
  arrange(year) %>% 
  mutate(change_e0 = e0 - lag(e0)) %>% 
  filter(between(year, 1955, 2014)) %>% 
  group_by(gender) %>% 
  summarise(
    mean_ch_e0 = mean(change_e0),
    sd_ch_e0 = sd(change_e0)
  ) %>% 
  ungroup()

summary_stats
summary_stats %>% 
  left_join(
    dta_e0 %>% 
      filter(code == "GBR_SCO") %>% 
      filter(gender != "total") %>% 
      group_by(gender) %>% 
      arrange(year) %>% 
      mutate(change_e0 = e0 - lag(e0)) %>% 
      filter(year == 2015) %>% 
      select(gender, change_in_2015 = change_e0)
  ) %>% 
  mutate(
    prob_2015_chance = pmap_dbl(
      list(
        q = change_in_2015,
        mean = mean_ch_e0,
        sd = sd_ch_e0
      ),
      .f = pnorm
    ),
    one_in = 1/ prob_2015_chance,
    prob_no_improvovement = pmap_dbl(
      list(
        q = c(0,0),
        mean = mean_ch_e0,
        sd = sd_ch_e0
      ),
      .f = pnorm
    )
  )
## Joining, by = "gender"

Discussion

Appendix? Quick exploration: negative autocorrelation

The model above assumes that the change in life expectancy in any year is not dependent on that in the previous year. Put another way, the model is ‘memoryless’, and has not predictor terms (i.e. ‘right hand’ terms in regression formula). The next simplest model approach would be to build a model that includes the previous year’s change as a predictor variable. So, a one year lag term.

The prior hypothesis - consistent with ‘harvesting’ - is that the change in one year will be negatively correlated with the previous year.

This can be done by building a linear regression model with the previous year as a predictor variable

mdls <- dta_e0 %>% 
  filter(code == "GBR_SCO") %>% 
  filter(gender != "total") %>% 
  group_by(gender) %>% 
  arrange(year) %>% 
  mutate(change_e0 = e0 - lag(e0)) %>% 
  mutate(last_change_e0 = lag(change_e0)) %>% 
  filter(between(year, 1955, 2014)) %>% 
  group_by(gender) %>% 
  nest() %>% 
  mutate(mdl_nolag = map(data, ~lm(change_e0 ~ 1, data = .))) %>% 
  mutate(mdl_lag = map(data, ~lm(change_e0 ~ last_change_e0, data = .)))  %>% 
  mutate(anova = map2(mdl_nolag, mdl_lag, anova)) %>% 
  mutate(p_autocorrelation = map_dbl(anova, ~.[2,6])) %>% 
  mutate(model_lag_tidy = map(mdl_lag, broom::tidy))

lag_mdl_summary <- mdls %>% 
  select(gender, model_lag_tidy) %>% 
  unnest()
  
p_autocorr_female <- mdls %>% filter(gender == "female") %>% pull("p_autocorrelation")
p_autocorr_male <- mdls %>% filter(gender == "male") %>% pull("p_autocorrelation")

Two model specifications have been built.

As mdl_nolag is the same as mdl_lag with one of its terms set to zero, mdl_nolag is a restricted version of mdl_lag, and so ANOVA can be used to test whether the move from the unrestricted model (mdl_nolag) to the restricted (mdl_lag) is justified in terms of improved fit to the data. In effect, the ANOVA test gives us an estimated probability that the data contains autocorrelation. These probabilities are 1.410^{-4} for females, and 0.02706 for males, and so provide strong evidence that autocorrelation exists in the data.

The specific type of autocorrelation of interest is negative autocorrelation. Negative autocorrelation would be present if the estimate column for the term last_change_e0 were negative. The coefficient of this term is statistically significant at p < 0.05 if the corresponding row in the column p.value is below 0.05. We can see from the table below that both of these conditions are true, though the magnitude of the negative autocorrelation is larger for females than males, and the associated p value for females much smaller than for males.

lag_mdl_summary

One conclusion we might draw from this evidence that the data contains negative autocorrelation is that the most troubling aspect of changes in life expectancy in Scotland since 2014 is not necessarily the fall in life expectancy from 2014-15, but the lack of any notable recovery in life expectancy increases from 2015-16. Negative autocorrelation would imply that a year showing a substantial fall in life expectancy is more likely to be followed by a substantial improvement in life expectancy, above the long-term average improvement, the following year. Instead, life expectancy improvement in 2015-16 was close to zero.

Appendix: What if all available years were used for Scotland?

This supplementary analysis will explore the effect of using all available years for Scotland, rather than 1955-2014. This should be useful for avoiding any accusation of ‘cherry picking’ the selection of years.

dta_e0 %>% 
  filter(code == "GBR_SCO") %>% 
  filter(gender != "total") %>% 
  group_by(gender) %>% 
  arrange(year) %>% 
  mutate(change_e0 = e0 - lag(e0)) %>% 
  filter(between(year, 1855, 2014)) %>% 
  ggplot(aes(x = change_e0)) + 
  stat_density(alpha = 0.2) +
  geom_rug() + 
  facet_grid(gender ~ .) +
  geom_vline(xintercept = 0, size = 1.5) +
  geom_vline(
    aes(xintercept =change_e0),
    data = dta_e0 %>% 
      filter(code == "GBR_SCO") %>% 
      filter(gender != "total") %>% 
      group_by(gender) %>% 
      arrange(year) %>% 
      mutate(change_e0 = e0 - lag(e0)) %>% 
      filter(year == 2015),
    colour = "red",
    linetype = "dashed"
  ) +
  geom_vline(
    aes(xintercept =mean_change_e0),
    data = dta_e0 %>% 
      filter(code == "GBR_SCO") %>% 
      filter(gender != "total") %>% 
      group_by(gender) %>% 
      arrange(year) %>% 
      mutate(change_e0 = e0 - lag(e0)) %>% 
      filter(between(year, 1855, 2014)) %>% 
      group_by(gender) %>% 
      summarise(mean_change_e0 = mean(change_e0, na.rm = T)) %>% 
      ungroup(),
    colour = "blue"
  ) + 
  labs(
    title = "Range of falls and gains in annual life expectancy in Scotland, 1855-2014",
    x = "Change in life expectancy from previous year in years",
    y = "Density of distribution of changes"
  ) +
  geom_text(
    aes(x = mean_change_e0), y = 0.15, 
    label = "Average\nChange", fontface = "bold", colour = "white",
    nudge_y = 0.01, nudge_x = 0.0025,
    data = dta_e0 %>% 
          filter(code == "GBR_SCO") %>% 
          filter(gender != "total") %>% 
          group_by(gender) %>% 
          arrange(year) %>% 
          mutate(change_e0 = e0 - lag(e0)) %>% 
          filter(between(year, 1855, 2014)) %>% 
          group_by(gender) %>% 
          summarise(mean_change_e0 = mean(change_e0, na.rm = T)) %>% 
          ungroup()
  ) +
  geom_text(
    aes(x = mean_change_e0), y = 0.15, 
    label = "Average\nChange", fontface = "bold",
    data = dta_e0 %>% 
          filter(code == "GBR_SCO") %>% 
          filter(gender != "total") %>% 
          group_by(gender) %>% 
          arrange(year) %>% 
          mutate(change_e0 = e0 - lag(e0)) %>% 
          filter(between(year, 1855, 2014)) %>% 
          group_by(gender) %>% 
          summarise(mean_change_e0 = mean(change_e0, na.rm = T)) %>% 
          ungroup()
            ) +
  geom_text(
    aes(x = change_e0), y = 0.45, 
    label = "Change from\n2014-2015", fontface = "bold", colour = "white",
    nudge_y = 0.01, nudge_x = 0.0025,
    data = dta_e0 %>% 
      filter(code == "GBR_SCO") %>% 
      filter(gender != "total") %>% 
      group_by(gender) %>% 
      arrange(year) %>% 
      mutate(change_e0 = e0 - lag(e0)) %>% 
      filter(year == 2015)
  ) +
  geom_text(
    aes(x = change_e0), y = 0.45, 
    label = "Change from\n2014-2015", fontface = "bold",
    data = dta_e0 %>% 
      filter(code == "GBR_SCO") %>% 
      filter(gender != "total") %>% 
      group_by(gender) %>% 
      arrange(year) %>% 
      mutate(change_e0 = e0 - lag(e0)) %>% 
      filter(year == 2015)
  ) +
  geom_text(
    aes(x = change_e0), y = 0.45, 
    label = "Change from\n2014-2015", fontface = "bold",
    data = dta_e0 %>% 
      filter(code == "GBR_SCO") %>% 
      filter(gender != "total") %>% 
      group_by(gender) %>% 
      arrange(year) %>% 
      mutate(change_e0 = e0 - lag(e0)) %>% 
      filter(year == 2015)
  ) +
  geom_text(
    aes(x = change_e0, label = year), y = 0.1,
    data = dta_e0 %>% 
      filter(code == "GBR_SCO") %>% 
      filter(gender != "total") %>% 
      group_by(gender) %>% 
      arrange(year) %>% 
      mutate(change_e0 = e0 - lag(e0)) %>% 
      filter(between(year, 1855, 2014)) %>% 
      mutate(rank_ch = rank(change_e0)) %>% 
      filter(rank_ch <= 2 | rank_ch >= 158)
  )
## Warning: Removed 2 rows containing non-finite values (stat_density).
## Warning: Removed 2 rows containing missing values (geom_text).

It’s clear that there used to be much more variability in annual changes before the Second World War than after, and that a Normal distribution is much less appropriate. However if summarising the data in this way the mean and sd are as follows:

summary_stats_full <- dta_e0 %>% 
  filter(code == "GBR_SCO") %>% 
  filter(gender != "total") %>% 
  group_by(gender) %>% 
  arrange(year) %>% 
  mutate(change_e0 = e0 - lag(e0)) %>% 
  filter(between(year, 1855, 2014)) %>% 
  group_by(gender) %>% 
  summarise(
    mean_ch_e0 = mean(change_e0, na.rm = T),
    sd_ch_e0 = sd(change_e0, na.rm = T)
  ) %>% 
  ungroup()

summary_stats_full

The mean increases were a bit higher, but the SDs were much, much higher.

If making the assumption that the kind of extreme variability in annual changes observed in the 19th century and first half of the 20th century should still be considered relevant, then the probability of seeing a fall as or more severe than seen in 2014-15 falls to one-in-3. (See below)

summary_stats_full %>% 
  left_join(
    dta_e0 %>% 
      filter(code == "GBR_SCO") %>% 
      filter(gender != "total") %>% 
      group_by(gender) %>% 
      arrange(year) %>% 
      mutate(change_e0 = e0 - lag(e0)) %>% 
      filter(year == 2015) %>% 
      select(gender, change_in_2015 = change_e0)
  ) %>% 
  mutate(
    prob_2015_chance = pmap_dbl(
      list(
        q = change_in_2015,
        mean = mean_ch_e0,
        sd = sd_ch_e0
      ),
      .f = pnorm
    ),
    one_in = 1/ prob_2015_chance,
    prob_no_improvovement = pmap_dbl(
      list(
        q = c(0,0),
        mean = mean_ch_e0,
        sd = sd_ch_e0
      ),
      .f = pnorm
    )
  )
## Joining, by = "gender"

Appendix: How does this compare with England & Wales?

dta_e0 %>% 
  filter(code == "GBRCENW") %>% 
  filter(gender != "total") %>% 
  group_by(gender) %>% 
  arrange(year) %>% 
  mutate(change_e0 = e0 - lag(e0)) %>% 
  ggplot(aes(x = year, y = change_e0)) + 
  geom_line() + geom_point() +
  facet_grid(gender ~ .) + 
  geom_hline(yintercept = 0) + 
  scale_x_continuous(breaks = seq(1840, 2010, by = 10)) + 
  annotate("rect", xmin = 1955, xmax = 2014, ymin = -Inf, ymax = Inf, alpha = 0.2) + 
  labs(
    title = "Annual changes in life expectancy over time in England & Wales, 1841-2016",
    subtitle = "Highlighted region: 1955-2014. Highlighted point: 2015",
    caption = "Source: HMD",
    x = "Year", 
    y = "Change in life expectancy from previous year in years"
  ) + 
  geom_point(
    aes(x = year, y = change_e0),
    size = 2, shape = 15, colour = "red",
    data = dta_e0 %>% 
      filter(code == "GBRCENW") %>% 
      filter(gender != "total") %>% 
      group_by(gender) %>% 
      arrange(year) %>% 
      mutate(change_e0 = e0 - lag(e0)) %>% 
      filter(year == 2015)
  )
## Warning: Removed 1 rows containing missing values (geom_path).
## Warning: Removed 2 rows containing missing values (geom_point).

The range of variability is much higher, due to much greater exposure to the 1918 influenza pandemic/end of First World War. Let’s now look again just at the range starting 1955

dta_e0 %>% 
  filter(code == "GBRCENW") %>% 
  filter(year >= 1955) %>% 
  filter(gender != "total") %>% 
  group_by(gender) %>% 
  arrange(year) %>% 
  mutate(change_e0 = e0 - lag(e0)) %>% 
  ggplot(aes(x = year, y = change_e0)) + 
  geom_line() + geom_point() +
  facet_grid(gender ~ .) + 
  geom_hline(yintercept = 0) + 
  scale_x_continuous(breaks = seq(1860, 2010, by = 10)) + 
  labs(
    title = "Annual changes in life expectancy over time in England & Wales, 1955-2016",
    subtitle = "Highlighted point: 2015",
    caption = "Source: HMD",
    x = "Year", 
    y = "Change in life expectancy from previous year in years"
  ) + 
  geom_point(
    aes(x = year, y = change_e0),
    size = 2, shape = 15, colour = "red",
    data = dta_e0 %>% 
      filter(code == "GBRCENW") %>% 
      filter(gender != "total") %>% 
      group_by(gender) %>% 
      arrange(year) %>% 
      mutate(change_e0 = e0 - lag(e0)) %>% 
      filter(year == 2015)
  )
## Warning: Removed 1 rows containing missing values (geom_path).
## Warning: Removed 2 rows containing missing values (geom_point).

Now to show the distribution in comparison to 2014-15

dta_e0 %>% 
  filter(code == "GBRCENW") %>% 
  filter(gender != "total") %>% 
  group_by(gender) %>% 
  arrange(year) %>% 
  mutate(change_e0 = e0 - lag(e0)) %>% 
  filter(between(year, 1955, 2014)) %>% 
  ggplot(aes(x = change_e0)) + 
  stat_density(alpha = 0.2) +
  geom_rug() + 
  facet_grid(gender ~ .) +
  geom_vline(xintercept = 0, size = 1.5) +
  geom_vline(
    aes(xintercept =change_e0),
    data = dta_e0 %>% 
      filter(code == "GBRCENW") %>% 
      filter(gender != "total") %>% 
      group_by(gender) %>% 
      arrange(year) %>% 
      mutate(change_e0 = e0 - lag(e0)) %>% 
      filter(year == 2015),
    colour = "red",
    linetype = "dashed"
  ) +
  geom_vline(
    aes(xintercept =mean_change_e0),
    data = dta_e0 %>% 
      filter(code == "GBRCENW") %>% 
      filter(gender != "total") %>% 
      group_by(gender) %>% 
      arrange(year) %>% 
      mutate(change_e0 = e0 - lag(e0)) %>% 
      filter(between(year, 1955, 2014)) %>% 
      group_by(gender) %>% 
      summarise(mean_change_e0 = mean(change_e0)) %>% 
      ungroup(),
    colour = "blue"
  ) + 
  labs(
    title = "Range of falls and gains in annual life expectancy in England & Wales, 1955-2014",
    x = "Change in life expectancy from previous year in years",
    y = "Density of distribution of changes"
  ) +
  geom_text(
    aes(x = mean_change_e0), y = 0.75, 
    label = "Average\nChange", fontface = "bold", colour = "white",
    nudge_y = 0.01, nudge_x = 0.0025,
    data = dta_e0 %>% 
          filter(code == "GBRCENW") %>% 
          filter(gender != "total") %>% 
          group_by(gender) %>% 
          arrange(year) %>% 
          mutate(change_e0 = e0 - lag(e0)) %>% 
          filter(between(year, 1955, 2014)) %>% 
          group_by(gender) %>% 
          summarise(mean_change_e0 = mean(change_e0)) %>% 
          ungroup()
  ) +
  geom_text(
    aes(x = mean_change_e0), y = 0.75, 
    label = "Average\nChange", fontface = "bold",
    data = dta_e0 %>% 
          filter(code == "GBRCENW") %>% 
          filter(gender != "total") %>% 
          group_by(gender) %>% 
          arrange(year) %>% 
          mutate(change_e0 = e0 - lag(e0)) %>% 
          filter(between(year, 1955, 2014)) %>% 
          group_by(gender) %>% 
          summarise(mean_change_e0 = mean(change_e0)) %>% 
          ungroup()
            ) +
  geom_text(
    aes(x = change_e0), y = 0.75, 
    label = "Change from\n2014-2015", fontface = "bold", colour = "white",
    nudge_y = 0.01, nudge_x = 0.0025,
    data = dta_e0 %>% 
      filter(code == "GBRCENW") %>% 
      filter(gender != "total") %>% 
      group_by(gender) %>% 
      arrange(year) %>% 
      mutate(change_e0 = e0 - lag(e0)) %>% 
      filter(year == 2015)
  ) +
  geom_text(
    aes(x = change_e0), y = 0.75, 
    label = "Change from\n2014-2015", fontface = "bold",
    data = dta_e0 %>% 
      filter(code == "GBRCENW") %>% 
      filter(gender != "total") %>% 
      group_by(gender) %>% 
      arrange(year) %>% 
      mutate(change_e0 = e0 - lag(e0)) %>% 
      filter(year == 2015)
  ) +
  geom_text(
    aes(x = change_e0), y = 0.75, 
    label = "Change from\n2014-2015", fontface = "bold",
    data = dta_e0 %>% 
      filter(code == "GBRCENW") %>% 
      filter(gender != "total") %>% 
      group_by(gender) %>% 
      arrange(year) %>% 
      mutate(change_e0 = e0 - lag(e0)) %>% 
      filter(year == 2015)
  ) +
  geom_text(
    aes(x = change_e0, label = year), y = 0.1,
    data = dta_e0 %>% 
      filter(code == "GBRCENW") %>% 
      filter(gender != "total") %>% 
      group_by(gender) %>% 
      arrange(year) %>% 
      mutate(change_e0 = e0 - lag(e0)) %>% 
      filter(between(year, 1955, 2014)) %>% 
      mutate(rank_ch = rank(change_e0)) %>% 
      filter(rank_ch <= 2 | rank_ch >= 59)
  )

The 2014-15 fall looks to be a comparably rarer event for England & Wales than for Scotland. Let’s quantify this

summary_stats <- dta_e0 %>% 
  filter(code == "GBRCENW") %>% 
  filter(gender != "total") %>% 
  group_by(gender) %>% 
  arrange(year) %>% 
  mutate(change_e0 = e0 - lag(e0)) %>% 
  filter(between(year, 1955, 2014)) %>% 
  group_by(gender) %>% 
  summarise(
    mean_ch_e0 = mean(change_e0),
    sd_ch_e0 = sd(change_e0)
  ) %>% 
  ungroup()

summary_stats

And now the estimated probability/frequency

summary_stats %>% 
  left_join(
    dta_e0 %>% 
      filter(code == "GBRCENW") %>% 
      filter(gender != "total") %>% 
      group_by(gender) %>% 
      arrange(year) %>% 
      mutate(change_e0 = e0 - lag(e0)) %>% 
      filter(year == 2015) %>% 
      select(gender, change_in_2015 = change_e0)
  ) %>% 
  mutate(
    prob_2015_chance = pmap_dbl(
      list(
        q = change_in_2015,
        mean = mean_ch_e0,
        sd = sd_ch_e0
      ),
      .f = pnorm
    ),
    one_in = 1/ prob_2015_chance,
    prob_no_improvovement = pmap_dbl(
      list(
        q = c(0,0),
        mean = mean_ch_e0,
        sd = sd_ch_e0
      ),
      .f = pnorm
    )
  )
## Joining, by = "gender"

The initial visual impression, based largely on just counting the ticks to the left of the red dashed line, was a bit misleading.

The 2014-15 fall was a 1-in-27 year event (cf 1-in-16 for Scotland) for males, and a 1-in-42 year event (cf 1-in-60) for females.

Appendix: visual exploration of 1 year and 2 year autocorrelation

This section will visually look at autocorrelation of 1 and 2 lags in Scotland, and England & Wales

dta_e0 %>% 
  filter(code == "GBR_SCO") %>% 
  filter(gender != "total") %>% 
  group_by(gender) %>% 
  arrange(year) %>% 
  mutate(change_e0 = e0 - lag(e0)) %>% 
  mutate(lag_1 = lag(change_e0)) %>% 
  mutate(lag_2 = lag(change_e0, 2)) %>% 
  filter(between(year, 1955, 2014)) %>%
  gather(key = "lag", value = "value", lag_1, lag_2) %>% 
  group_by(gender) %>% 
  ggplot(aes(x = change_e0, y = value)) + 
  geom_point() + 
  geom_smooth(method = "lm") +
  facet_grid(lag ~ gender)

As might be expected, the autocorrelation with the previous year (lag_1) is stronger than the autocorrelation with two years previously (lag_2).

Let’s quantify this

dta_e0 %>% 
  filter(code == "GBR_SCO") %>% 
  filter(gender != "total") %>% 
  group_by(gender) %>% 
  arrange(year) %>% 
  mutate(change_e0 = e0 - lag(e0)) %>% 
  mutate(lag_1 = lag(change_e0)) %>% 
  mutate(lag_2 = lag(change_e0, 2)) %>% 
  filter(between(year, 1955, 2014)) %>%
  gather(key = "lag", value = "value", lag_1, lag_2) %>% 
  group_by(gender, lag) %>% 
  nest() %>% 
  mutate(corr = map_dbl(data, ~ cor(.x[,c("change_e0", "value")]) %>% .[2,1])) %>% 
  select(-data) %>% 
  spread(lag, corr)

So, the change over a year is autocorrelated with the previous year’s change, and more weakly for two years’ prior.

The autocorrelation to the previous year’s change is -0.47 for females and -0.29 for males.

And now for England & Wales

dta_e0 %>% 
  filter(code == "GBRCENW") %>% 
  filter(gender != "total") %>% 
  group_by(gender) %>% 
  arrange(year) %>% 
  mutate(change_e0 = e0 - lag(e0)) %>% 
  mutate(lag_1 = lag(change_e0)) %>% 
  mutate(lag_2 = lag(change_e0, 2)) %>% 
  filter(between(year, 1955, 2014)) %>%
  gather(key = "lag", value = "value", lag_1, lag_2) %>% 
  group_by(gender) %>% 
  ggplot(aes(x = change_e0, y = value)) + 
  geom_point() + 
  geom_smooth(method = "lm") +
  facet_grid(lag ~ gender)

Lag-2 autocorrelation looks even weaker for England & Wales than for Scotland. Again, let’s quantify this

dta_e0 %>% 
  filter(code == "GBRCENW") %>% 
  filter(gender != "total") %>% 
  group_by(gender) %>% 
  arrange(year) %>% 
  mutate(change_e0 = e0 - lag(e0)) %>% 
  mutate(lag_1 = lag(change_e0)) %>% 
  mutate(lag_2 = lag(change_e0, 2)) %>% 
  filter(between(year, 1955, 2014)) %>%
  gather(key = "lag", value = "value", lag_1, lag_2) %>% 
  group_by(gender, lag) %>% 
  nest() %>% 
  mutate(corr = map_dbl(data, ~ cor(.x[,c("change_e0", "value")]) %>% .[2,1])) %>% 
  select(-data) %>% 
  spread(lag, corr)

Pretty much no lag-2 autocorrelation.

But a very similar magnitude of lag-1 autocorrelation by sex as for Scotland.

Additional data production

The aim of this section is to produce a dataset that can be visualised in Excel. This dataset will show the change in e0 since 1955 for each country in the HMD.

Each column will be a different gender/country combination

tbl <- dta_e0 %>% 
  filter(gender != "total") %>% 
  group_by(code, gender) %>% 
  arrange(year) %>% 
  mutate(change_e0 = e0 - lag(e0))  %>% 
  filter(year >= 1955) %>% 
  unite(sex_country, code, gender) %>% 
  select(sex_country, year, change_e0) %>% 
  spread(sex_country, change_e0, fill = NA_real_)
  
write_csv(tbl, path = "support/changes_since_1955.csv")

And now, to visualise for all countries

g <- dta_e0 %>% 
  filter(!(code %in% c("NZL_NM" ,"NZL_MA"))) %>% 
  filter(gender != "total") %>% 
  group_by(code, gender) %>% 
  arrange(year) %>% 
  mutate(change_e0 = e0 - lag(e0))  %>% 
  filter(year >= 1955) %>% 
  ggplot(aes(x = year, y = change_e0)) + 
  facet_grid(code~gender) + 
  geom_point() + geom_line() + 
  geom_hline(yintercept = 0) + 
  labs(
    title = "Annual changes in life expectancy over time, 1955-2016, all countries",
    subtitle = "Highlighted point: 2015",
    caption = "Source: HMD",
    x = "Year", 
    y = "Change in life expectancy from previous year in years"
  ) + 
  geom_point(
    aes(x = year, y = change_e0),
    size = 2, shape = 15, colour = "red",
    data = dta_e0 %>% 
      filter(!(code %in% c("NZL_NM" ,"NZL_MA"))) %>% 
      filter(gender != "total") %>% 
      group_by(code, gender) %>% 
      arrange(year) %>% 
      mutate(change_e0 = e0 - lag(e0)) %>% 
      filter(year == 2015)
  )

print(g)
## Warning: Removed 36 rows containing missing values (geom_point).

ggsave("support/all_countries_change_since_1955.png", height = 200, width = 25, units = "cm", dpi = 300, limitsize = FALSE)
## Warning: Removed 36 rows containing missing values (geom_point).

Comparison of 0, 1, and 2 lag models for all countries

The aim of this section is to generalise the analyses shown earlier for Scotland, and England & Wales, to all HMD countries

dta_e0 %>% 
  filter(!(code %in% c("NZL_NM" ,"NZL_MA"))) %>% 
  filter(gender != "total") %>% 
  group_by(code, gender) %>% 
  arrange(year) %>% 
  mutate(change_e0 = e0 - lag(e0)) %>%
  mutate(lag_1 = lag(change_e0)) %>% 
  mutate(lag_2 = lag(change_e0, 2)) %>%
  filter(between(year, 1955, 2014)) %>% 
  group_by(code, gender) %>% 
  nest() %>% 
  mutate(
    m0 = map(data, ~lm(change_e0 ~ 1,              data = .)),
    m1 = map(data, ~lm(change_e0 ~ lag_1,          data = .)),
    m2 = map(data, ~lm(change_e0 ~ lag_1 + lag_2,  data = .))
  ) %>% 
  select(-data) %>% 
  gather(key = "lag", value = "model", m0:m2) %>% 
  mutate(
    aic = map_dbl(model, AIC),
    bic = map_dbl(model, BIC)
  ) %>% 
  mutate(lag = substr(lag, 2,2) %>% as.integer) %>% 
  ggplot(aes(x = lag, y = bic, group = gender, colour = gender, shape = gender)) + 
  facet_wrap(~code, scales = "free_y") + 
  geom_point() + geom_line()

This indicates that a simple ‘oscillatory’ pattern of annual changes - a bad year predicting a good year and vice versa - is far from a universal pattern, and often differs significantly between genders even in the same country. Often, but not always, more terms lead to a better penalised model fit (using BIC, which tends to be more conservative than AIC).

Let’s now explore simply the magnitude and direction of lag 1 negative autocorrelation in the data

dta_e0 %>% 
  filter(!(code %in% c("NZL_NM" ,"NZL_MA"))) %>% 
  filter(gender != "total") %>% 
  group_by(code, gender) %>% 
  arrange(year) %>% 
  mutate(change_e0 = e0 - lag(e0)) %>% 
  mutate(lag_1 = lag(change_e0)) %>% 
  mutate(lag_2 = lag(change_e0, 2)) %>% 
  filter(between(year, 1955, 2014)) %>%
  gather(key = "lag", value = "value", lag_1, lag_2) %>% 
  group_by(code, gender, lag) %>% 
  nest() %>% 
  mutate(corr = map_dbl(data, ~ cor(.x[,c("change_e0", "value")]) %>% .[2,1])) %>% 
  select(-data) %>% 
  spread(lag, corr) %>% 
  filter(!is.na(lag_1)) %>% 
  ggplot(aes(x = lag_1, y = reorder(code, lag_1), shape = gender, colour = gender)) + 
  geom_point() +
  geom_vline(xintercept = 0) + 
  labs(
    x = "Correlation with previous year",
    y = "Country",
    title = "Autocorrelation between annual changes in life expectancy and change in previous year",
    subtitle = "Period: 1955-2014 or nearest available range of years",
    caption = "Source: HMD"
       )

This is important for establishing a general principle: Though a 1 lag model does not universally outperform 0 or 2 lag models, it is generally the case that a negative autocorrelation exists between changes in one year and changes in the following year. i.e. it is generally correct to assume that life expectancy changes ‘oscillate’ rather than are a pure random walk.