This R notebook explores child mortality by countries.
Gapminder (2015); UN Inter-agency Group for Child Mortality Estimation (2025) – processed by Our World in Data. “Child mortality rate – Gapminder; UN IGME – Long-run data” [dataset]. United Nations Inter-agency Group for Child Mortality Estimation, “United Nations Inter-agency Group for Child Mortality Estimation”; Gapminder, “Child mortality rate under age five v7”; Gapminder based on UN IGME & UN WPP, “Under-five Mortality v11”; Various sources, “Population” [original data]. Source: Gapminder (2015); UN Inter-agency Group for Child Mortality Estimation (2025) – processed by Our World In Data (https://ourworldindata.org/grapher/child-mortality)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.6
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.1 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.2
## ✔ purrr 1.2.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(reshape2)
##
## Attaching package: 'reshape2'
##
## The following object is masked from 'package:tidyr':
##
## smiths
library(terra)
## terra 1.8.93
##
## Attaching package: 'terra'
##
## The following object is masked from 'package:tidyr':
##
## extract
library(tidyterra)
##
## Attaching package: 'tidyterra'
##
## The following object is masked from 'package:stats':
##
## filter
mortality.data <- read.csv("child-mortality.csv")
# Remove non-country specific aggregates
mortality.data <- subset(
mortality.data, (Code != "")
)
head(mortality.data, 10)
## Entity Code Year Child.mortality.rate
## 1 Afghanistan AFG 1957 37.13
## 2 Afghanistan AFG 1958 36.52
## 3 Afghanistan AFG 1959 35.95
## 4 Afghanistan AFG 1960 35.32
## 5 Afghanistan AFG 1961 34.76
## 6 Afghanistan AFG 1962 34.23
## 7 Afghanistan AFG 1963 33.68
## 8 Afghanistan AFG 1964 33.17
## 9 Afghanistan AFG 1965 32.65
## 10 Afghanistan AFG 1966 32.15
Describe the Year in the dataset.
q.values <- quantile(unique(mortality.data$Year))
q.data <- as.data.frame(q.values)
colnames(q.data) <- c("Year")
q.data
## Year
## 0% 1751
## 25% 1819
## 50% 1887
## 75% 1955
## 100% 2023
List distinct countries in the dataset with record count and Year range.
#data.frame(Entity = sort(unique(mortality.data$Entity)))
mortality.data %>%
group_by(Entity) %>%
summarise(
Year.Min = min(Year),
Year.Max = max(Year),
Count = n()
) %>%
arrange(Entity)
## # A tibble: 202 × 4
## Entity Year.Min Year.Max Count
## <chr> <int> <int> <int>
## 1 Afghanistan 1957 2023 67
## 2 Albania 1978 2023 46
## 3 Algeria 1954 2023 70
## 4 Andorra 1985 2023 39
## 5 Angola 1980 2023 44
## 6 Anguilla 1985 2023 39
## 7 Antigua and Barbuda 1949 2023 75
## 8 Argentina 1911 2023 108
## 9 Armenia 1976 2023 48
## 10 Australia 1870 2023 154
## # ℹ 192 more rows
Display a histogram of mortality rate distribution for the dataset.
ggplot(
mortality.data,
aes(x = Child.mortality.rate)
) +
geom_histogram(
#aes(y = ..density..),
bins = 30,
fill = "royalblue",
col = "navy"
) +
labs(
title = "Histogram of Child Mortality Rate",
x = "Mortality Rate",
y = "Frequency"
) +
theme_bw()
Aggregate the Mortality Rate by Year.
chart.data <- mortality.data %>%
group_by(Year) %>%
summarise(
Min.Rate = min(Child.mortality.rate),
Max.Rate = max(Child.mortality.rate),
Mean.Rate = round(mean(Child.mortality.rate), 2)
) %>%
melt(
id.vars = "Year",
variable.name = "Variable",
value.name = "Value"
) %>%
arrange(Year, Variable)
head(chart.data, 10)
## Year Variable Value
## 1 1751 Min.Rate 29.98
## 2 1751 Max.Rate 29.98
## 3 1751 Mean.Rate 29.98
## 4 1752 Min.Rate 35.97
## 5 1752 Max.Rate 35.97
## 6 1752 Mean.Rate 35.97
## 7 1753 Min.Rate 30.34
## 8 1753 Max.Rate 30.34
## 9 1753 Mean.Rate 30.34
## 10 1754 Min.Rate 32.69
Create a plot showing minimum, maximum, and average mortality rate by year.
ggplot(
chart.data,
aes(x = Year, y = Value, col = Variable)
) +
geom_line(
lwd = 0.7
) +
labs(
title = "Child Mortality Rate through the Years",
x = "Year",
y = "Mortality Rate"
) +
scale_color_manual(
values = c("blue", "red", "darkgreen")
) +
theme_bw()
Produce a regression chart of mortality rate over the years.
ggplot(
mortality.data,
aes(Year, Child.mortality.rate)
) +
geom_point(
col = "cornflowerblue"
) +
geom_smooth(
col = "coral"
) +
labs(
title = "Child Mortality Rate Regression over Years",
x = "Years",
y = "Mortality Rate"
) +
theme_bw()
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
Find top 10 observations with the highest mortality rate.
mortality.data %>%
arrange(desc(Child.mortality.rate)) %>%
head(10)
## Entity Code Year Child.mortality.rate
## 1 South Sudan SSD 1988 76.74
## 2 Barbados BRB 1906 68.21
## 3 Barbados BRB 1912 67.56
## 4 Barbados BRB 1914 65.45
## 5 Barbados BRB 1921 65.12
## 6 Finland FIN 1868 63.66
## 7 Iceland ISL 1846 60.83
## 8 Barbados BRB 1923 59.76
## 9 Iceland ISL 1882 58.75
## 10 Barbados BRB 1919 57.16
Show mortality rates distribution for top 20 countries with largest spread.
sum.data <- mortality.data %>%
group_by(Code) %>%
summarise(
Mortality.Rate.Min = min(Child.mortality.rate),
Mortality.Rate.Max = max(Child.mortality.rate),
Spread = (Mortality.Rate.Max - Mortality.Rate.Min),
Count = n()
) %>%
arrange(desc(Spread)) %>%
head(20)
sum.data
## # A tibble: 20 × 5
## Code Mortality.Rate.Min Mortality.Rate.Max Spread Count
## <chr> <dbl> <dbl> <dbl> <int>
## 1 BRB 1 68.2 67.2 118
## 2 SSD 9.87 76.7 66.9 70
## 3 FIN 0.23 63.7 63.4 158
## 4 ISL 0.26 60.8 60.6 186
## 5 IRQ 2.26 57.1 54.8 82
## 6 CHL 0.67 55.2 54.6 123
## 7 DEU 0.37 53.9 53.6 161
## 8 KHM 2.29 54.1 51.8 49
## 9 RUS 0.45 49.9 49.4 111
## 10 SWE 0.25 49.0 48.7 273
## 11 ROU 0.65 47.8 47.1 148
## 12 MUS 1.47 48.4 46.9 151
## 13 AUT 0.31 47.1 46.8 190
## 14 HUN 0.38 46.4 46.1 142
## 15 SGP 0.21 46.3 46.1 92
## 16 PRK 1.74 46.3 44.6 74
## 17 YEM 3.93 48.2 44.3 72
## 18 ITA 0.28 44.5 44.2 152
## 19 MLI 9.13 49.8 40.7 64
## 20 IND 2.77 43.4 40.6 107
ggplot(sum.data) +
geom_bar(
mapping = aes(x = Code, y = Count),
stat = "identity",
col = "navy",
fill = "royalblue"
) +
labs(
title = "Observation Count by Country",
x = "Country",
y = "Observations"
) +
theme_bw()
chart.data <- subset(mortality.data, Code %in% unique(sum.data$Code))
ggplot(chart.data) +
geom_boxplot(
mapping = aes(x = Code, y = Child.mortality.rate),
fill = "royalblue",
staplewidth = 0.5,
outlier.color = "red"
) +
labs(
title = paste("Child Mortality Rate Distribution for",
"Top 20 Countries with Highest Deviation"),
x = "Country",
y = "Mortality Rate"
) +
theme_bw()
Get a count of observations by year to see what years have the densest amount of observations.
sum.data <- mortality.data %>%
group_by(Year) %>%
summarise(Countries = n()) %>%
arrange(Year)
ggplot(sum.data) +
geom_line(
mapping = aes(x = Year, y = Countries),
stat = "identity",
col = "royalblue",
linewidth = 1
) +
labs(
title = "Number of Countries with Mortality Rate"
) +
theme_bw()
It appears not all countries were reporting child mortality rates until 1980s. We’ll select only data since since 1980 to perform spatial analysis.
# Load world spatial vector data for country outlines
file.path <- "../world-ash-ms.geojson"
world.data <- vect(file.path) %>%
subset(
.$iso_a3 != "ATA",
select = c("iso_a3", "sov_a3")
) %>%
mutate(
iso_a3 = ifelse(sov_a3 == "FR1", "FRA", iso_a3)
)
Combine the spatial data with mean mortality rate.
sum.data <- mortality.data %>%
subset(.$Year >= 1980) %>%
group_by(Code) %>%
summarise(
Average.Rate = round(mean(Child.mortality.rate), 3)
)
map.data <- merge(
world.data,
sum.data,
na.rm = TRUE,
by.y = "Code",
by.x = "iso_a3"
)
head(map.data, 10)
## iso_a3 sov_a3 Average.Rate
## 1 CRI CRI 1.402
## 2 NIC NIC 4.432
## 3 FRA FR1 0.643
## 4 HTI HTI 11.043
## 5 DOM DOM 4.729
## 6 SLV SLV 3.987
## 7 GTM GTM 5.754
## 8 CUB CU1 1.034
## 9 HND HND 4.217
## 10 USA US1 0.913
ggplot() +
geom_spatvector(
data = world.data,
fill = "lightgray"
) +
geom_spatvector(
data = map.data,
mapping = aes(fill = Average.Rate),
col = "black"
) +
scale_fill_viridis_c(
direction = 1
) +
labs(
title = "Average Child Mortality Rate by Country",
subtitle = "-- Since 1980 --"
) +
theme_bw() +
theme(
plot.title = element_text(hjust = 0.5),
plot.subtitle = element_text(hjust = 0.5)
)