# How has life expectancy changed across income and regional aggregates since 2000?
# Libraries used in class
library(dplyr)
library(tidyr)
library(ggplot2)
# Locate the World Bank life-expectancy export (indicator SP.DYN.LE00.IN) in
# the working directory. The pattern only accepts file names that begin with
# "API_SP.DYN.LE00.IN" and end in ".csv".
wb_file <- list.files(pattern = "^API_SP\\.DYN\\.LE00\\.IN.*\\.csv$")

# Abort with setup instructions when no matching file is present.
if (length(wb_file) < 1) {
  stop("Place the World Bank CSV for SP.DYN.LE00.IN in this folder.
The filename usually starts with 'API_SP.DYN.LE00.IN_'.")
}

# Read the first match; the first four lines of the file are skipped so the
# next line is used as the header. With check.names = TRUE the year columns
# come out as X1960, X1961, ...
life_raw <- read.csv(
  file = wb_file[1],
  skip = 4,
  check.names = TRUE,
  stringsAsFactors = FALSE
)

# Preview the first rows of the first eight columns
head(life_raw[, 1:8])
## Country.Name Country.Code
## 1 Aruba ABW
## 2 Africa Eastern and Southern AFE
## 3 Afghanistan AFG
## 4 Africa Western and Central AFW
## 5 Angola AGO
## 6 Albania ALB
## Indicator.Name Indicator.Code X1960 X1961
## 1 Life expectancy at birth, total (years) SP.DYN.LE00.IN 64.04900 64.21500
## 2 Life expectancy at birth, total (years) SP.DYN.LE00.IN 44.16966 44.46884
## 3 Life expectancy at birth, total (years) SP.DYN.LE00.IN 32.79900 33.29100
## 4 Life expectancy at birth, total (years) SP.DYN.LE00.IN 37.77964 38.05896
## 5 Life expectancy at birth, total (years) SP.DYN.LE00.IN 37.93300 36.90200
## 6 Life expectancy at birth, total (years) SP.DYN.LE00.IN 56.41300 57.48800
## X1962 X1963
## 1 64.60200 64.94400
## 2 44.87789 45.16058
## 3 33.75700 34.20100
## 4 38.68179 38.93692
## 5 37.16800 37.41900
## 6 58.49400 59.47900
# Narrow the wide table to the two identifier columns plus the yearly
# columns: X2000-X2009 via prefix match, X2010-X2020 via a column range.
life_wide <- select(
  life_raw,
  Country.Name,
  Country.Code,
  starts_with("X200"),
  X2010:X2020
)
# Pivot to long format: one row per (Country.Name, Country.Code, Year).
# names_prefix strips the leading "X" from each year column name, and the
# remaining digits are then converted to an integer Year.
life_long <- life_wide %>%
  pivot_longer(
    cols = starts_with("X"),
    names_to = "Year",
    names_prefix = "X",
    values_to = "LifeExpectancy"
  ) %>%
  mutate(Year = as.integer(Year))
# Missing-value audit of the long table: total rows next to the number of
# rows whose LifeExpectancy is NA.
total_rows <- nrow(life_long)
total_na <- sum(is.na(life_long[["LifeExpectancy"]]))
data.frame(total_rows = total_rows, total_na = total_na)
## total_rows total_na
## 1 5586 21
# Mean life expectancy per year across every row of life_long (NAs ignored).
# NOTE(review): life_long still holds the file's aggregate rows (regions,
# income groups) next to individual countries, so this is an unweighted mean
# over all of them rather than a population-weighted world figure.
by_year <- life_long %>%
  group_by(Year) %>%
  summarise(mean_life = mean(LifeExpectancy, na.rm = TRUE))

head(by_year, n = 6)
## # A tibble: 6 × 2
## Year mean_life
## <int> <dbl>
## 1 2000 67.2
## 2 2001 67.5
## 3 2002 67.8
## 4 2003 68.1
## 5 2004 68.5
## 6 2005 68.8
# Aggregate rows to compare, matched against Country.Name
focus_groups <- c("High income", "Low income", "Sub-Saharan Africa")

# Keep only those aggregates, restricted to the years 2000-2020
life_focus <- life_long %>%
  filter(
    Country.Name %in% focus_groups,
    Year >= 2000 & Year <= 2020
  )
# Mean life expectancy per selected aggregate and year (NAs ignored).
# .groups = "drop" makes summarize() return an ungrouped tibble: without it
# the result stays grouped by Country.Name, every later verb runs per group,
# and summarize() prints a message about the grouping it kept.
by_group_year <- life_focus %>%
  group_by(Country.Name, Year) %>%
  summarize(
    mean_life = mean(LifeExpectancy, na.rm = TRUE),
    .groups = "drop"
  )

head(by_group_year)
## # A tibble: 6 × 3
## Country.Name Year mean_life
## <chr> <int> <dbl>
## 1 High income 2000 76.0
## 2 High income 2001 76.3
## 3 High income 2002 76.4
## 4 High income 2003 76.5
## 5 High income 2004 77.0
## 6 High income 2005 77.1
# Availability check: number of rows with a recorded (non-NA) value per year.
# The count runs over every row of life_long, aggregate rows included.
availability <- life_long %>%
  filter(Year >= 2000 & Year <= 2020) %>%
  group_by(Year) %>%
  summarise(n_non_missing = sum(!is.na(LifeExpectancy)))

head(availability, n = 6)
## # A tibble: 6 × 2
## Year n_non_missing
## <int> <int>
## 1 2000 265
## 2 2001 265
## 3 2002 265
## 4 2003 265
## 5 2004 265
## 6 2005 265
# Insight 1: difference between the 2020 and 2000 values of the yearly mean
# series in by_year, rounded to one decimal place.
global_2000 <- by_year$mean_life[by_year$Year == 2000]
global_2020 <- by_year$mean_life[by_year$Year == 2020]
global_change <- round(global_2020 - global_2000, digits = 1)
cat(
  "Insight 1: Global average life expectancy rose by about",
  global_change,
  "years from 2000 to 2020.\n"
)
## Insight 1: Global average life expectancy rose by about 5 years from 2000 to 2020.
# Insight 2: 2020 difference between the "High income" and
# "Sub-Saharan Africa" rows of by_group_year, rounded to one decimal place.
hi_2020 <- with(
  by_group_year,
  mean_life[Country.Name == "High income" & Year == 2020]
)
ssa_2020 <- with(
  by_group_year,
  mean_life[Country.Name == "Sub-Saharan Africa" & Year == 2020]
)
gap_2020 <- round(hi_2020 - ssa_2020, digits = 1)
cat(
  "Insight 2: In 2020 the gap between High income and Sub-Saharan Africa was about",
  gap_2020,
  "years.\n"
)
## Insight 2: In 2020 the gap between High income and Sub-Saharan Africa was about 18 years.
# Line chart of the yearly mean series stored in by_year.
# Line thickness is set with linewidth: ggplot2 3.4.0 deprecated the size
# aesthetic for lines and warns when it is still used.
ggplot(by_year, aes(x = Year, y = mean_life)) +
  geom_line(color = "steelblue", linewidth = 1) +
  labs(
    title = "Global Life Expectancy (2000–2020)",
    x = "Year",
    y = "Average Life Expectancy"
  )