ETW3483 IA2 Storyboard

Load Required Libraries

library(readr)
library(dplyr)


Attaching package: 'dplyr'

The following objects are masked from 'package:stats':

    filter, lag

The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union

library(ggplot2)
library(ggcorrplot)
library(caret)

Loading required package: lattice

Load and Rename Datasets

child_mortality <- read_csv("child-mortality.csv") %>%
  rename(Country = Entity, Year = Year, Child_Mortality_Rate = `Child mortality rate`)

Rows: 16655 Columns: 4
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (2): Entity, Code
dbl (2): Year, Child mortality rate

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

extreme_poverty <- read_csv("share-of-population-living-in-extreme-poverty .csv") %>%
  rename(Extreme_Poverty_Share = `Share below $2.15 a day`)

Rows: 2705 Columns: 3
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (1): Country
dbl (2): Year, Share below $2.15 a day

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

mean_income <- read_csv("mean-income-or-consumption-per-day.csv") %>%
  rename(Mean_Income = `Mean income or consumption per day`)

Rows: 2705 Columns: 3
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (1): Country
dbl (2): Year, Mean income or consumption per day

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

poorest_10 <- read_csv("threshold-income-or-consumption-per-day-marking-the-poorest-decile.csv") %>%
  rename(Poorest_10_Threshold = `Threshold income or consumption per day marking the poorest decile`)

Rows: 2705 Columns: 3
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (1): Country
dbl (2): Year, Threshold income or consumption per day marking the poorest d...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

richest_10 <- read_csv("threshold-income-or-consumption-per-day-marking-the-richest-decile.csv") %>%
  rename(Richest_10_Threshold = `Threshold income or consumption per day marking the richest decile`)

Rows: 2705 Columns: 3
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (1): Country
dbl (2): Year, Threshold income or consumption per day marking the richest d...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

fertility_rate <- read_csv("fertility-rate-children-per-woman.csv") %>%
  rename(Country = Entity, Fertility_Rate = `Fertility rate - Sex: all - Age: all - Variant: estimates`)

Rows: 18722 Columns: 3
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (1): Entity
dbl (2): Year, Fertility rate - Sex: all - Age: all - Variant: estimates

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

median_age <- read_csv("median-age.csv") %>%
  rename(Country = Entity, Median_Age = `Median age - Sex: all - Age: all - Variant: estimates`)

Rows: 18722 Columns: 3
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (1): Entity
dbl (2): Year, Median age - Sex: all - Age: all - Variant: estimates

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

dependency_ratio <- read_csv("total-dependency-ratio.csv") %>%
  rename(Country = Entity, Dependency_Ratio = `Total dependency ratio - Sex: all - Variant: estimates`)

Rows: 18944 Columns: 3
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (1): Entity
dbl (2): Year, Total dependency ratio - Sex: all - Variant: estimates

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Merge and Clean Data

final_data <- child_mortality %>%
  full_join(extreme_poverty, by = c("Country", "Year")) %>%
  full_join(mean_income, by = c("Country", "Year")) %>%
  full_join(poorest_10, by = c("Country", "Year")) %>%
  full_join(richest_10, by = c("Country", "Year")) %>%
  full_join(fertility_rate, by = c("Country", "Year")) %>%
  full_join(median_age, by = c("Country", "Year")) %>%
  full_join(dependency_ratio, by = c("Country", "Year"))

poverty_mortality_data <- final_data %>%
  filter(Year >= 2000) %>%
  filter(
    !is.na(Child_Mortality_Rate),
    !is.na(Extreme_Poverty_Share),
    !is.na(Mean_Income),
    !is.na(Poorest_10_Threshold),
    !is.na(Richest_10_Threshold)
  )

Region Mapping

poverty_mortality_data <- poverty_mortality_data %>%
  mutate(
    Region = case_when(
      Country %in% c("Afghanistan", "Bangladesh", "India", "Nepal", "Pakistan", "Sri Lanka") ~ "South Asia",
      
      Country %in% c("Algeria", "Angola", "Benin", "Botswana", "Burkina Faso", "Burundi", 
                     "Cameroon", "Cape Verde", "Central African Republic", "Chad", "Comoros", 
                     "Democratic Republic of Congo", "Djibouti", "Egypt", "Equatorial Guinea", 
                     "Eritrea", "Eswatini", "Ethiopia", "Gabon", "Gambia", "Ghana", "Guinea", 
                     "Guinea-Bissau", "Kenya", "Lesotho", "Liberia", "Libya", "Madagascar", 
                     "Malawi", "Mali", "Mauritania", "Mauritius", "Morocco", "Mozambique", 
                     "Namibia", "Niger", "Nigeria", "Rwanda", "Sao Tome and Principe", 
                     "Senegal", "Seychelles", "Sierra Leone", "Somalia", "South Africa", 
                     "South Sudan", "Sudan", "Tanzania", "Togo", "Tunisia", "Uganda", 
                     "Zambia", "Zimbabwe") ~ "Sub-Saharan Africa",
      
      Country %in% c("Brunei", "Cambodia", "East Timor", "Indonesia", "Laos", "Malaysia", 
                     "Myanmar", "Philippines", "Singapore", "Thailand", "Vietnam") ~ "Southeast Asia",
      
      Country %in% c("Argentina", "Belize", "Bolivia", "Brazil", "Chile", "Colombia", 
                     "Costa Rica", "Cuba", "Dominican Republic", "Ecuador", "El Salvador", 
                     "Guatemala", "Honduras", "Mexico", "Nicaragua", "Panama", "Paraguay", 
                     "Peru", "Uruguay", "Venezuela") ~ "Latin America",
      
      Country %in% c("China", "Japan", "Mongolia", "North Korea", "South Korea", "Taiwan") ~ "East Asia",
      
      Country %in% c("Bahrain", "Iran", "Iraq", "Israel", "Jordan", "Kuwait", "Lebanon", 
                     "Oman", "Palestine", "Qatar", "Saudi Arabia", "Syria", "United Arab Emirates", 
                     "Yemen") ~ "Middle East",
      
      Country %in% c("Albania", "Austria", "Belarus", "Belgium", "Bosnia and Herzegovina", "Bulgaria", 
                     "Croatia", "Czechia", "Denmark", "Estonia", "Finland", "France", "Germany", 
                     "Greece", "Hungary", "Ireland", "Italy", "Kosovo", "Latvia", "Lithuania", 
                     "Moldova", "Netherlands", "North Macedonia", "Norway", "Poland", "Portugal", 
                     "Romania", "Russia", "Serbia", "Slovakia", "Slovenia", "Spain", "Sweden", 
                     "Switzerland", "Ukraine", "United Kingdom") ~ "Europe",
      
      Country %in% c("Kazakhstan", "Kyrgyzstan", "Tajikistan", "Turkmenistan", "Uzbekistan") ~ "Central Asia",
      
      Country %in% c("Australia", "Fiji", "Kiribati", "Marshall Islands", "Micronesia (country)", 
                     "Nauru", "New Zealand", "Palau", "Papua New Guinea", "Samoa", "Solomon Islands", 
                     "Tonga", "Tuvalu", "Vanuatu") ~ "Oceania",
      
      TRUE ~ NA_character_
    )
  )

Regional Aggregation

regional_mortality <- poverty_mortality_data %>%
  filter(Year >= 2020 & !is.na(Region)) %>%
  group_by(Region) %>%
  summarise(Average_Mortality = mean(Child_Mortality_Rate, na.rm = TRUE)) %>%
  arrange(desc(Average_Mortality))

Set color scheme

region_colors <- c(
  "Sub-Saharan Africa" = "#D7263D",  # Red
  "South Asia" = "#66A61E",         # Green
  "Southeast Asia" = "#1C9EDB",     # Blue
  "Latin America" = "#FF8C42",      # Orange
  "East Asia" = "#7E57C2",          # Purple
  "Middle East" = "#FFB74D",        # Light Orange
  "Europe" = "#42A5F5",             # Light Blue
  "Central Asia" = "#009688",       # Teal
  "Oceania" = "#AB47BC"             # Violet
)

BAR CHART

ggplot(regional_mortality, aes(x = reorder(Region, Average_Mortality), y = Average_Mortality, fill = Region)) +
  geom_col() +
  geom_text(aes(label = round(Average_Mortality, 2)), hjust = -0.1, size = 6, color = "black") +
  scale_fill_manual(values = region_colors) +
  coord_flip() +
  labs(
    title = "Average Under-5 Child Mortality Rate by Region (2020–2023)",
    x = "Region",
    y = "Mortality Rate (deaths per 1,000 live births)",
    caption = "Source: Our World in Data, UN Population Division"
  ) +
  theme_minimal(base_size = 13) +
  theme(legend.position = "none")

PIE CHART

regional_share <- regional_mortality %>%
  mutate(
    Share = Average_Mortality / sum(Average_Mortality),
    Label = paste0(Region, "\n", round(Share * 100, 1), "%")
  )

ggplot(regional_share, aes(x = "", y = Share, fill = Region)) +
  geom_col(width = 1, color = "white") +
  coord_polar(theta = "y") +
  geom_text(aes(label = paste0(round(Share * 100, 1), "%")),
            position = position_stack(vjust = 0.5), size = 2, color = "white") +
  scale_fill_manual(values = region_colors) +
  labs(
    title = "Child Mortality Rate Share by Region (2020–2023)",
    caption = "Source: Our World in Data, UN Population Division"
  ) +
  theme_void(base_size = 13) +
  theme(legend.title = element_blank())

Scatter Plot 1: Mean Income vs Child Mortality Rate

scatter_data1 <- poverty_mortality_data %>%
  filter(Year >= 2020, !is.na(Mean_Income), !is.na(Child_Mortality_Rate), !is.na(Region))

ggplot(scatter_data1, aes(x = Mean_Income, y = Child_Mortality_Rate, color = Region)) +
  geom_point(size = 2.8, alpha = 0.85) +
  geom_smooth(method = "lm", se = FALSE, color = "black", linetype = "dashed") +
  scale_color_manual(values = region_colors) +
  labs(
    title = "Higher Income is Linked to Lower Child Mortality",
    subtitle = "Sub-Saharan Africa stands out with low income and high mortality rates",
    x = "Mean Daily Income (USD)",
    y = "Under-5 Mortality Rate (per 1,000 live births)",
    caption = "Each dot represents a country | Source: Our World in Data"
  ) +
  theme_minimal(base_size = 13) +
  theme(legend.title = element_blank())

`geom_smooth()` using formula = 'y ~ x'

Scatter Plot 2: Fertility Rate vs Child Mortality Rate

scatter_data2 <- poverty_mortality_data %>%
  filter(Year >= 2020, !is.na(Fertility_Rate), !is.na(Child_Mortality_Rate), !is.na(Region))

ggplot(scatter_data2, aes(x = Fertility_Rate, y = Child_Mortality_Rate, color = Region)) +
  geom_point(size = 2.8, alpha = 0.85) +
  geom_smooth(method = "lm", se = FALSE, color = "black", linetype = "dashed") +
  scale_color_manual(values = region_colors) +
  labs(
    title = "Higher Fertility Is Associated with Higher Child Mortality",
    subtitle = "Sub-Saharan Africa again shows clustering of high fertility and mortality rates",
    x = "Fertility Rate (Children per Woman)",
    y = "Under-5 Mortality Rate (per 1,000 live births)",
    caption = "Each dot represents a country | Source: Our World in Data"
  ) +
  theme_minimal(base_size = 13) +
  theme(legend.title = element_blank())

`geom_smooth()` using formula = 'y ~ x'

Slide 4: Correlation Heatmap

# Select Y, X, and M variables from the final cleaned dataset
corr_data <- poverty_mortality_data %>%
  select(
    Child_Mortality_Rate,                  # Y variable
    Extreme_Poverty_Share, Mean_Income,    # X variables
    Poorest_10_Threshold, Richest_10_Threshold,
    Fertility_Rate,                        # X variable
    Median_Age, Dependency_Ratio           # M variables
  )

# Compute correlation matrix (using complete observations)
cor_matrix <- cor(corr_data, use = "complete.obs")

# Plot the correlation heatmap
ggcorrplot(
  cor_matrix,
  method = "square",
  type = "lower",
  lab = TRUE,
  lab_size = 3,
  colors = c("#D7263D", "white", "#1C9EDB"), # Red to white to blue scale
  title = "Correlation Heatmap: Predictors & Under-5 Mortality",
  ggtheme = ggplot2::theme_minimal()
)

Multiple Linear Regression Model

# Filter Relevant Variables
model_data <- poverty_mortality_data %>%
  select(
    Child_Mortality_Rate,
    Extreme_Poverty_Share,
    Mean_Income,
    Poorest_10_Threshold,
    Richest_10_Threshold,
    Fertility_Rate,
    Median_Age,
    Dependency_Ratio
  ) %>%
  na.omit()  # Remove missing rows

# Train-Test Split (70/30)

set.seed(123)  # for reproducibility
train_index <- createDataPartition(model_data$Child_Mortality_Rate, p = 0.7, list = FALSE)
train_data <- model_data[train_index, ]
test_data <- model_data[-train_index, ]


# Fit Base Model (Multiple Linear Regression)

base_model <- lm(Child_Mortality_Rate ~ ., data = train_data)

# Predict on Test Set

predictions <- predict(base_model, newdata = test_data)

# Calculate R-squared on Test Set

r_squared <- cor(predictions, test_data$Child_Mortality_Rate)^2
cat("R-squared (Test Set):", round(r_squared, 3), "\n")

R-squared (Test Set): 0.835

# Optional: View model summary
summary(base_model)


Call:
lm(formula = Child_Mortality_Rate ~ ., data = train_data)

Residuals:
    Min      1Q  Median      3Q     Max 
-5.1262 -0.4587 -0.0021  0.3795 15.8635 

Coefficients:
                       Estimate Std. Error t value Pr(>|t|)    
(Intercept)           -0.819027   0.445432  -1.839 0.066213 .  
Extreme_Poverty_Share  0.084323   0.004145  20.342  < 2e-16 ***
Mean_Income           -0.201833   0.054192  -3.724 0.000205 ***
Poorest_10_Threshold   0.131564   0.037889   3.472 0.000535 ***
Richest_10_Threshold   0.078735   0.023254   3.386 0.000734 ***
Fertility_Rate         1.780150   0.110252  16.146  < 2e-16 ***
Median_Age             0.016602   0.009912   1.675 0.094208 .  
Dependency_Ratio      -0.031734   0.007559  -4.198  2.9e-05 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1.285 on 1152 degrees of freedom
Multiple R-squared:  0.8135,    Adjusted R-squared:  0.8124 
F-statistic: 718.1 on 7 and 1152 DF,  p-value: < 2.2e-16

# Calculate RMSE
rmse <- sqrt(mean((predictions - test_data$Child_Mortality_Rate)^2))

# Model coefficients summary
model_coef <- summary(base_model)$coefficients

Conclusion

This analysis demonstrates how income, fertility, and poverty related indicators strongly predict under-5 mortality across global regions particularly Sub-Saharan Africa and South Asia.