Question

How has life expectancy changed across income and regional aggregates since 2000?

1. Import

# Libraries used in class
library(dplyr)
library(tidyr)
library(ggplot2)

# Locate the downloaded World Bank CSV (life expectancy: SP.DYN.LE00.IN)
# in the working directory. The pattern is anchored at the start of the
# filename, so only files beginning with "API_SP.DYN.LE00.IN" match.
wb_file <- list.files(pattern = "^API_SP\\.DYN\\.LE00\\.IN.*\\.csv$")

# Fail early with a single-line message when no file is present. The message
# is assembled from separate strings so that no source-code newline or
# indentation leaks into the error text.
if (length(wb_file) == 0) {
  stop(
    "Place the World Bank CSV for SP.DYN.LE00.IN in this folder. ",
    "The filename usually starts with 'API_SP.DYN.LE00.IN_'.",
    call. = FALSE
  )
}

# Only the first match is read below; say so when there is more than one
# candidate instead of picking one silently.
if (length(wb_file) > 1) {
  warning(
    "Multiple files match 'API_SP.DYN.LE00.IN*.csv'; using '",
    wb_file[1], "'.",
    call. = FALSE
  )
}

# skip = 4 discards the first four lines of the file before the header row
# is read. check.names = TRUE makes the column names syntactic, which is why
# the year columns appear as X1960, X1961, ...
life_raw <- read.csv(
  wb_file[1],
  skip = 4,
  check.names = TRUE,
  stringsAsFactors = FALSE
)

# Show the first rows of the first eight columns as a sanity check
head(life_raw[seq_len(8)])
##                  Country.Name Country.Code
## 1                       Aruba          ABW
## 2 Africa Eastern and Southern          AFE
## 3                 Afghanistan          AFG
## 4  Africa Western and Central          AFW
## 5                      Angola          AGO
## 6                     Albania          ALB
##                            Indicator.Name Indicator.Code    X1960    X1961
## 1 Life expectancy at birth, total (years) SP.DYN.LE00.IN 64.04900 64.21500
## 2 Life expectancy at birth, total (years) SP.DYN.LE00.IN 44.16966 44.46884
## 3 Life expectancy at birth, total (years) SP.DYN.LE00.IN 32.79900 33.29100
## 4 Life expectancy at birth, total (years) SP.DYN.LE00.IN 37.77964 38.05896
## 5 Life expectancy at birth, total (years) SP.DYN.LE00.IN 37.93300 36.90200
## 6 Life expectancy at birth, total (years) SP.DYN.LE00.IN 56.41300 57.48800
##      X1962    X1963
## 1 64.60200 64.94400
## 2 44.87789 45.16058
## 3 33.75700 34.20100
## 4 38.68179 38.93692
## 5 37.16800 37.41900
## 6 58.49400 59.47900

2. Format

# Narrow the raw table to the two identifier columns plus the year columns
# X2000 through X2020
life_wide <- select(life_raw, Country.Name, Country.Code, X2000:X2020)

# Tidy long format: one row per entity and year. The "X" prefix is stripped
# from the former column names and the remainder is stored as an integer Year.
life_long <- pivot_longer(
  life_wide,
  cols = -c(Country.Name, Country.Code),
  names_to = "Year",
  names_prefix = "X",
  names_transform = list(Year = as.integer),
  values_to = "LifeExpectancy"
)

# Basic NA check: total number of rows next to the number of missing
# life-expectancy values
total_rows <- nrow(life_long)
total_na <- sum(is.na(life_long[["LifeExpectancy"]]))

data.frame(total_rows, total_na)
##   total_rows total_na
## 1       5586       21

3. Summaries with dplyr

# Mean life expectancy per year over all rows of life_long, with missing
# values ignored.
# NOTE(review): life_long still holds the aggregate rows (e.g. "High income",
# "Sub-Saharan Africa") next to individual countries, so this is an unweighted
# mean over countries and aggregates together - confirm that this is the
# intended definition of "global".
by_year <- summarize(
  group_by(life_long, Year),
  mean_life = mean(LifeExpectancy, na.rm = TRUE)
)

head(by_year)
## # A tibble: 6 × 2
##    Year mean_life
##   <int>     <dbl>
## 1  2000      67.2
## 2  2001      67.5
## 3  2002      67.8
## 4  2003      68.1
## 5  2004      68.5
## 6  2005      68.8
# Focus on a few aggregates that are already in the file
focus_groups <- c("High income", "Low income", "Sub-Saharan Africa")
life_focus <- life_long %>%
  filter(Country.Name %in% focus_groups, Year >= 2000, Year <= 2020)

# Yearly means for selected aggregates.
# .groups = "drop" returns an ungrouped tibble; without it the result stays
# grouped by Country.Name, which would silently affect any later grouped
# operation on by_group_year.
by_group_year <- life_focus %>%
  group_by(Country.Name, Year) %>%
  summarize(mean_life = mean(LifeExpectancy, na.rm = TRUE), .groups = "drop")
head(by_group_year)
## # A tibble: 6 × 3
##   Country.Name  Year mean_life
##   <chr>        <int>     <dbl>
## 1 High income   2000      76.0
## 2 High income   2001      76.3
## 3 High income   2002      76.4
## 4 High income   2003      76.5
## 5 High income   2004      77.0
## 6 High income   2005      77.1
# A simple count example: per year, how many rows of life_long (countries and
# aggregates alike) carry a non-missing life-expectancy value
life_2000_2020 <- filter(life_long, between(Year, 2000, 2020))
availability <- summarize(
  group_by(life_2000_2020, Year),
  n_non_missing = sum(!is.na(LifeExpectancy))
)
head(availability)
## # A tibble: 6 × 2
##    Year n_non_missing
##   <int>         <int>
## 1  2000           265
## 2  2001           265
## 3  2002           265
## 4  2003           265
## 5  2004           265
## 6  2005           265

4. Two insights

# Insight 1: difference between the 2020 and 2000 yearly means in by_year,
# rounded to one decimal place
global_2000 <- by_year$mean_life[by_year$Year == 2000]
global_2020 <- by_year$mean_life[by_year$Year == 2020]
global_change <- round(global_2020 - global_2000, 1)

cat(
  "Insight 1: Global average life expectancy rose by about",
  global_change,
  "years from 2000 to 2020.\n"
)
## Insight 1: Global average life expectancy rose by about 5 years from 2000 to 2020.
# Insight 2: 2020 difference between the "High income" and
# "Sub-Saharan Africa" rows of by_group_year, rounded to one decimal place
hi_2020 <- with(
  by_group_year,
  mean_life[Country.Name == "High income" & Year == 2020]
)
ssa_2020 <- with(
  by_group_year,
  mean_life[Country.Name == "Sub-Saharan Africa" & Year == 2020]
)

gap_2020 <- round(hi_2020 - ssa_2020, 1)

cat(
  "Insight 2: In 2020 the gap between High income and Sub-Saharan Africa was about",
  gap_2020,
  "years.\n"
)
## Insight 2: In 2020 the gap between High income and Sub-Saharan Africa was about 18 years.

5. Visual

# Line chart of the yearly mean life expectancy stored in by_year.
# Line thickness is set with `linewidth`; passing `size` to line geoms is
# deprecated since ggplot2 3.4.0 and triggers a deprecation warning.
ggplot(by_year, aes(x = Year, y = mean_life)) +
  geom_line(color = "steelblue", linewidth = 1) +
  labs(
    title = "Global Life Expectancy (2000–2020)",
    x = "Year",
    y = "Average Life Expectancy"
  )