This R notebook explores child mortality by countries.

Dataset Citation

Gapminder (2015); UN Inter-agency Group for Child Mortality Estimation (2025) – processed by Our World in Data. “Child mortality rate – Gapminder; UN IGME – Long-run data” [dataset]. United Nations Inter-agency Group for Child Mortality Estimation, “United Nations Inter-agency Group for Child Mortality Estimation”; Gapminder, “Child mortality rate under age five v7”; Gapminder based on UN IGME & UN WPP, “Under-five Mortality v11”; Various sources, “Population” [original data]. Source: Gapminder (2015); UN Inter-agency Group for Child Mortality Estimation (2025) – processed by Our World In Data (https://ourworldindata.org/grapher/child-mortality)


library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.6
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.1     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.2
## ✔ purrr     1.2.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(reshape2)
## 
## Attaching package: 'reshape2'
## 
## The following object is masked from 'package:tidyr':
## 
##     smiths
library(terra)
## terra 1.8.93
## 
## Attaching package: 'terra'
## 
## The following object is masked from 'package:tidyr':
## 
##     extract
library(tidyterra)
## 
## Attaching package: 'tidyterra'
## 
## The following object is masked from 'package:stats':
## 
##     filter
mortality.data <- read.csv("child-mortality.csv")

# Remove non-country specific aggregates
mortality.data <- subset(
    mortality.data, (Code != "")
)

head(mortality.data, 10)
##         Entity Code Year Child.mortality.rate
## 1  Afghanistan  AFG 1957                37.13
## 2  Afghanistan  AFG 1958                36.52
## 3  Afghanistan  AFG 1959                35.95
## 4  Afghanistan  AFG 1960                35.32
## 5  Afghanistan  AFG 1961                34.76
## 6  Afghanistan  AFG 1962                34.23
## 7  Afghanistan  AFG 1963                33.68
## 8  Afghanistan  AFG 1964                33.17
## 9  Afghanistan  AFG 1965                32.65
## 10 Afghanistan  AFG 1966                32.15

Describe the Year in the dataset.

q.values <- quantile(unique(mortality.data$Year))
q.data <- as.data.frame(q.values)
colnames(q.data) <- c("Year")
q.data
##      Year
## 0%   1751
## 25%  1819
## 50%  1887
## 75%  1955
## 100% 2023

List distinct countries in the dataset with record count and Year range.

#data.frame(Entity = sort(unique(mortality.data$Entity)))

mortality.data %>%
    group_by(Entity) %>%
    summarise(
        Year.Min = min(Year),
        Year.Max = max(Year),
        Count = n()
    ) %>%
    arrange(Entity)
## # A tibble: 202 × 4
##    Entity              Year.Min Year.Max Count
##    <chr>                  <int>    <int> <int>
##  1 Afghanistan             1957     2023    67
##  2 Albania                 1978     2023    46
##  3 Algeria                 1954     2023    70
##  4 Andorra                 1985     2023    39
##  5 Angola                  1980     2023    44
##  6 Anguilla                1985     2023    39
##  7 Antigua and Barbuda     1949     2023    75
##  8 Argentina               1911     2023   108
##  9 Armenia                 1976     2023    48
## 10 Australia               1870     2023   154
## # ℹ 192 more rows

Display a histogram of mortality rate distribution for the dataset.

ggplot(
        mortality.data,         
        aes(x = Child.mortality.rate)
    ) +
    geom_histogram(
        #aes(y = ..density..),
        bins = 30,
        fill = "royalblue",
        col = "navy"
    ) +
    labs(
        title = "Histogram of Child Mortality Rate",
        x = "Mortality Rate",
        y = "Frequency"
    ) +
    theme_bw()

Aggregate the Mortality Rate by Year.

chart.data <- mortality.data %>%
    group_by(Year) %>%
    summarise(
        Min.Rate = min(Child.mortality.rate),
        Max.Rate = max(Child.mortality.rate),
        Mean.Rate = round(mean(Child.mortality.rate), 2)
    ) %>%
    melt(
        id.vars = "Year",
        variable.name = "Variable",
        value.name = "Value"
    ) %>%
    arrange(Year, Variable)

head(chart.data, 10)
##    Year  Variable Value
## 1  1751  Min.Rate 29.98
## 2  1751  Max.Rate 29.98
## 3  1751 Mean.Rate 29.98
## 4  1752  Min.Rate 35.97
## 5  1752  Max.Rate 35.97
## 6  1752 Mean.Rate 35.97
## 7  1753  Min.Rate 30.34
## 8  1753  Max.Rate 30.34
## 9  1753 Mean.Rate 30.34
## 10 1754  Min.Rate 32.69

Create a plot showing minimum, maximum, and average mortality rate by year.

ggplot(
        chart.data,
        aes(x = Year, y = Value, col = Variable)
    ) +
    geom_line(
        lwd = 0.7
    ) +
    labs(
        title = "Child Mortality Rate through the Years",
        x = "Year",
        y = "Mortality Rate"
    ) +
    scale_color_manual(
        values = c("blue", "red", "darkgreen")
    ) +
    
    theme_bw()

Produce a regression chart of mortality rate over the years.

ggplot(
        mortality.data, 
        aes(Year, Child.mortality.rate)
    ) +
    geom_point(
        col = "cornflowerblue"
    ) +
    geom_smooth(
        col = "coral"
    ) +
    labs(
        title = "Child Mortality Rate Regression over Years",
        x = "Years",
        y = "Mortality Rate"
    ) +
    theme_bw()
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

Find top 10 observations with the highest mortality rate.

mortality.data %>%
    arrange(desc(Child.mortality.rate)) %>%
    head(10)
##         Entity Code Year Child.mortality.rate
## 1  South Sudan  SSD 1988                76.74
## 2     Barbados  BRB 1906                68.21
## 3     Barbados  BRB 1912                67.56
## 4     Barbados  BRB 1914                65.45
## 5     Barbados  BRB 1921                65.12
## 6      Finland  FIN 1868                63.66
## 7      Iceland  ISL 1846                60.83
## 8     Barbados  BRB 1923                59.76
## 9      Iceland  ISL 1882                58.75
## 10    Barbados  BRB 1919                57.16

Show mortality rates distribution for top 20 countries with largest spread.

sum.data <- mortality.data %>%
    group_by(Code) %>%
    summarise(
        Mortality.Rate.Min = min(Child.mortality.rate),
        Mortality.Rate.Max = max(Child.mortality.rate),
        Spread = (Mortality.Rate.Max - Mortality.Rate.Min),
        Count = n()
    ) %>%
    arrange(desc(Spread)) %>%
    head(20)

sum.data
## # A tibble: 20 × 5
##    Code  Mortality.Rate.Min Mortality.Rate.Max Spread Count
##    <chr>              <dbl>              <dbl>  <dbl> <int>
##  1 BRB                 1                  68.2   67.2   118
##  2 SSD                 9.87               76.7   66.9    70
##  3 FIN                 0.23               63.7   63.4   158
##  4 ISL                 0.26               60.8   60.6   186
##  5 IRQ                 2.26               57.1   54.8    82
##  6 CHL                 0.67               55.2   54.6   123
##  7 DEU                 0.37               53.9   53.6   161
##  8 KHM                 2.29               54.1   51.8    49
##  9 RUS                 0.45               49.9   49.4   111
## 10 SWE                 0.25               49.0   48.7   273
## 11 ROU                 0.65               47.8   47.1   148
## 12 MUS                 1.47               48.4   46.9   151
## 13 AUT                 0.31               47.1   46.8   190
## 14 HUN                 0.38               46.4   46.1   142
## 15 SGP                 0.21               46.3   46.1    92
## 16 PRK                 1.74               46.3   44.6    74
## 17 YEM                 3.93               48.2   44.3    72
## 18 ITA                 0.28               44.5   44.2   152
## 19 MLI                 9.13               49.8   40.7    64
## 20 IND                 2.77               43.4   40.6   107
ggplot(sum.data) +
    geom_bar(
        mapping = aes(x = Code, y = Count),
        stat = "identity", 
        col = "navy", 
        fill = "royalblue"
    ) +
    labs(
        title = "Observation Count by Country",
        x = "Country",
        y = "Observations"
    ) +
    theme_bw()

chart.data <- subset(mortality.data, Code %in% unique(sum.data$Code))

ggplot(chart.data) +
    geom_boxplot(
        mapping = aes(x = Code, y = Child.mortality.rate), 
        fill = "royalblue",
        staplewidth = 0.5,
        outlier.color = "red"
    ) +
    labs(
        title = paste("Child Mortality Rate Distribution for",
            "Top 20 Countries with Highest Deviation"),
        x = "Country",
        y = "Mortality Rate"
    ) +
    theme_bw()

Get a count of observations by year to see what years have the densest amount of observations.

sum.data <- mortality.data %>%
    group_by(Year) %>%
    summarise(Countries = n()) %>%
    arrange(Year)

ggplot(sum.data) +
    geom_line(
        mapping = aes(x = Year, y = Countries),
        stat = "identity", 
        col = "royalblue", 
        linewidth = 1
    ) +
    labs(
        title = "Number of Countries with Mortality Rate"
    ) +
    theme_bw()

It appears not all countries were reporting child mortality rates until 1980s. We’ll select only data since since 1980 to perform spatial analysis.

# Load world spatial vector data for country outlines

file.path <- "../world-ash-ms.geojson"

world.data <- vect(file.path) %>% 
    subset(
        .$iso_a3 != "ATA", 
        select = c("iso_a3", "sov_a3")
    ) %>%
    mutate(
        iso_a3 = ifelse(sov_a3 == "FR1", "FRA", iso_a3)
    )

Combine the spatial data with mean mortality rate.

sum.data <- mortality.data %>%
    subset(.$Year >= 1980) %>%
    group_by(Code) %>%
    summarise(
        Average.Rate = round(mean(Child.mortality.rate), 3)
    )

map.data <- merge(
        world.data, 
        sum.data, 
        na.rm = TRUE,
        by.y = "Code",
        by.x = "iso_a3"
    )

head(map.data, 10)
##    iso_a3 sov_a3 Average.Rate
## 1     CRI    CRI        1.402
## 2     NIC    NIC        4.432
## 3     FRA    FR1        0.643
## 4     HTI    HTI       11.043
## 5     DOM    DOM        4.729
## 6     SLV    SLV        3.987
## 7     GTM    GTM        5.754
## 8     CUB    CU1        1.034
## 9     HND    HND        4.217
## 10    USA    US1        0.913
ggplot() +
    geom_spatvector(
        data = world.data,
        fill = "lightgray"
    ) +
    geom_spatvector(
        data = map.data, 
        mapping = aes(fill = Average.Rate),
        col = "black"
    ) +
    scale_fill_viridis_c(
        direction = 1
    ) +
    labs(
        title = "Average Child Mortality Rate by Country",
        subtitle = "-- Since 1980 --"
    ) +
    theme_bw() +
    theme(
        plot.title = element_text(hjust = 0.5),
        plot.subtitle = element_text(hjust = 0.5)
    )