Rows: 16655 Columns: 4
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (2): Entity, Code
dbl (2): Year, Child mortality rate
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Load the extreme-poverty share series and give its value column a
# syntactic name so later steps can reference it without backticks.
# NOTE: the file name contains a space before ".csv"; it is kept exactly
# as written.
extreme_poverty <- read_csv(
  "share-of-population-living-in-extreme-poverty .csv"
) %>%
  rename(Extreme_Poverty_Share = `Share below $2.15 a day`)
Rows: 2705 Columns: 3
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (1): Country
dbl (2): Year, Share below $2.15 a day
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Load mean daily income/consumption and rename the value column to a
# syntactic identifier.
mean_income <- read_csv("mean-income-or-consumption-per-day.csv") %>%
  rename(Mean_Income = `Mean income or consumption per day`)
Rows: 2705 Columns: 3
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (1): Country
dbl (2): Year, Mean income or consumption per day
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Load the income/consumption threshold marking the poorest decile and
# rename the value column to a syntactic identifier.
poorest_10 <- read_csv(
  "threshold-income-or-consumption-per-day-marking-the-poorest-decile.csv"
) %>%
  rename(
    Poorest_10_Threshold =
      `Threshold income or consumption per day marking the poorest decile`
  )
Rows: 2705 Columns: 3
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (1): Country
dbl (2): Year, Threshold income or consumption per day marking the poorest d...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Load the income/consumption threshold marking the richest decile and
# rename the value column to a syntactic identifier.
richest_10 <- read_csv(
  "threshold-income-or-consumption-per-day-marking-the-richest-decile.csv"
) %>%
  rename(
    Richest_10_Threshold =
      `Threshold income or consumption per day marking the richest decile`
  )
Rows: 2705 Columns: 3
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (1): Country
dbl (2): Year, Threshold income or consumption per day marking the richest d...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Load the fertility-rate series. `Entity` becomes `Country` so the table
# shares the ("Country", "Year") key used when the datasets are joined.
fertility_rate <- read_csv("fertility-rate-children-per-woman.csv") %>%
  rename(
    Country = Entity,
    Fertility_Rate =
      `Fertility rate - Sex: all - Age: all - Variant: estimates`
  )
Rows: 18722 Columns: 3
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (1): Entity
dbl (2): Year, Fertility rate - Sex: all - Age: all - Variant: estimates
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Load the median-age series. `Entity` becomes `Country` so the table
# shares the ("Country", "Year") key used when the datasets are joined.
median_age <- read_csv("median-age.csv") %>%
  rename(
    Country = Entity,
    Median_Age = `Median age - Sex: all - Age: all - Variant: estimates`
  )
Rows: 18722 Columns: 3
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (1): Entity
dbl (2): Year, Median age - Sex: all - Age: all - Variant: estimates
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Load the total dependency ratio series. `Entity` becomes `Country` so the
# table shares the ("Country", "Year") key used when the datasets are joined.
dependency_ratio <- read_csv("total-dependency-ratio.csv") %>%
  rename(
    Country = Entity,
    Dependency_Ratio =
      `Total dependency ratio - Sex: all - Variant: estimates`
  )
Rows: 18944 Columns: 3
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (1): Entity
dbl (2): Year, Total dependency ratio - Sex: all - Variant: estimates
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Merge and Clean Data
# Combine every indicator table into one wide table keyed by
# ("Country", "Year"). Full joins keep country-years that are present in
# only some of the sources; the missing indicators are filled with NA.
final_data <- child_mortality %>%
  full_join(extreme_poverty, by = c("Country", "Year")) %>%
  full_join(mean_income, by = c("Country", "Year")) %>%
  full_join(poorest_10, by = c("Country", "Year")) %>%
  full_join(richest_10, by = c("Country", "Year")) %>%
  full_join(fertility_rate, by = c("Country", "Year")) %>%
  full_join(median_age, by = c("Country", "Year")) %>%
  full_join(dependency_ratio, by = c("Country", "Year"))

# Analysis sample: observations from 2000 onward that have the outcome and
# all four income/poverty indicators. The demographic columns
# (Fertility_Rate, Median_Age, Dependency_Ratio) may still contain NA here.
poverty_mortality_data <- final_data %>%
  filter(
    Year >= 2000,
    !is.na(Child_Mortality_Rate),
    !is.na(Extreme_Poverty_Share),
    !is.na(Mean_Income),
    !is.na(Poorest_10_Threshold),
    !is.na(Richest_10_Threshold)
  )
# Horizontal bar chart of average mortality per region. Bars are ordered by
# their value and each one is annotated with the value rounded to 2 decimals.
ggplot(
  regional_mortality,
  aes(
    x = reorder(Region, Average_Mortality),
    y = Average_Mortality,
    fill = Region
  )
) +
  geom_col() +
  geom_text(
    aes(label = round(Average_Mortality, 2)),
    hjust = -0.1, # negative hjust places the label just past the bar end
    size = 6,
    color = "black"
  ) +
  scale_fill_manual(values = region_colors) +
  coord_flip() +
  labs(
    title = "Average Under-5 Child Mortality Rate by Region (2020–2023)",
    x = "Region",
    y = "Mortality Rate (deaths per 1,000 live births)",
    caption = "Source: Our World in Data, UN Population Division"
  ) +
  theme_minimal(base_size = 13) +
  # Bars are already labelled on the axis, so the fill legend is redundant.
  theme(legend.position = "none")
PIE CHART
# Each region's share of the summed regional averages.
# `Label` (region name + percentage) is stored on the table but is not
# referenced by the plot below, which builds its own percentage-only label.
regional_share <- regional_mortality %>%
  mutate(
    Share = Average_Mortality / sum(Average_Mortality),
    Label = paste0(Region, "\n", round(Share * 100, 1), "%")
  )

# Pie chart: a single stacked bar (x = "") wrapped into polar coordinates.
ggplot(regional_share, aes(x = "", y = Share, fill = Region)) +
  geom_col(width = 1, color = "white") +
  coord_polar(theta = "y") +
  geom_text(
    aes(label = paste0(round(Share * 100, 1), "%")),
    position = position_stack(vjust = 0.5), # centre label in its slice
    size = 2,
    color = "white"
  ) +
  scale_fill_manual(values = region_colors) +
  labs(
    title = "Child Mortality Rate Share by Region (2020–2023)",
    caption = "Source: Our World in Data, UN Population Division"
  ) +
  theme_void(base_size = 13) +
  theme(legend.title = element_blank())
Scatter Plot 1: Mean Income vs Child Mortality Rate
# Rows from 2020 onward with income, mortality and region all present.
# NOTE(review): every year >= 2020 is kept, so a country with several years
# of data contributes several points, while the caption says "Each dot
# represents a country" -- confirm whether one row per country is intended.
scatter_data1 <- poverty_mortality_data %>%
  filter(
    Year >= 2020,
    !is.na(Mean_Income),
    !is.na(Child_Mortality_Rate),
    !is.na(Region)
  )

# Scatter of income vs. mortality, coloured by region. `color = "black"` on
# geom_smooth() overrides the colour aesthetic for the dashed fit line.
ggplot(
  scatter_data1,
  aes(x = Mean_Income, y = Child_Mortality_Rate, color = Region)
) +
  geom_point(size = 2.8, alpha = 0.85) +
  geom_smooth(
    method = "lm",
    se = FALSE,
    color = "black",
    linetype = "dashed"
  ) +
  scale_color_manual(values = region_colors) +
  labs(
    title = "Higher Income is Linked to Lower Child Mortality",
    subtitle = "Sub-Saharan Africa stands out with low income and high mortality rates",
    x = "Mean Daily Income (USD)",
    y = "Under-5 Mortality Rate (per 1,000 live births)",
    caption = "Each dot represents a country | Source: Our World in Data"
  ) +
  theme_minimal(base_size = 13) +
  theme(legend.title = element_blank())
`geom_smooth()` using formula = 'y ~ x'
Scatter Plot 2: Fertility Rate vs Child Mortality Rate
# Rows from 2020 onward with fertility, mortality and region all present.
# NOTE(review): every year >= 2020 is kept, so a country with several years
# of data contributes several points, while the caption says "Each dot
# represents a country" -- confirm whether one row per country is intended.
scatter_data2 <- poverty_mortality_data %>%
  filter(
    Year >= 2020,
    !is.na(Fertility_Rate),
    !is.na(Child_Mortality_Rate),
    !is.na(Region)
  )

# Scatter of fertility vs. mortality, coloured by region. `color = "black"`
# on geom_smooth() overrides the colour aesthetic for the dashed fit line.
ggplot(
  scatter_data2,
  aes(x = Fertility_Rate, y = Child_Mortality_Rate, color = Region)
) +
  geom_point(size = 2.8, alpha = 0.85) +
  geom_smooth(
    method = "lm",
    se = FALSE,
    color = "black",
    linetype = "dashed"
  ) +
  scale_color_manual(values = region_colors) +
  labs(
    title = "Higher Fertility Is Associated with Higher Child Mortality",
    subtitle = "Sub-Saharan Africa again shows clustering of high fertility and mortality rates",
    x = "Fertility Rate (Children per Woman)",
    y = "Under-5 Mortality Rate (per 1,000 live births)",
    caption = "Each dot represents a country | Source: Our World in Data"
  ) +
  theme_minimal(base_size = 13) +
  theme(legend.title = element_blank())
`geom_smooth()` using formula = 'y ~ x'
Slide 4: Correlation Heatmap
# Select Y, X, and M variables from the final cleaned dataset
corr_data <- poverty_mortality_data %>%
  select(
    Child_Mortality_Rate, # Y variable
    Extreme_Poverty_Share, Mean_Income, # X variables
    Poorest_10_Threshold, Richest_10_Threshold,
    Fertility_Rate, # X variable
    Median_Age, Dependency_Ratio # M variables
  )

# Compute correlation matrix (using complete observations): rows with an NA
# in any selected column are dropped before the correlations are computed.
cor_matrix <- cor(corr_data, use = "complete.obs")

# Plot the correlation heatmap (lower triangle only, coefficients printed
# inside the squares).
ggcorrplot(
  cor_matrix,
  method = "square",
  type = "lower",
  lab = TRUE,
  lab_size = 3,
  colors = c("#D7263D", "white", "#1C9EDB"), # Red to white to blue scale
  title = "Correlation Heatmap: Predictors & Under-5 Mortality",
  ggtheme = ggplot2::theme_minimal()
)
Multiple Linear Regression Model
# Filter Relevant Variables
model_data <- poverty_mortality_data %>%
  select(
    Child_Mortality_Rate, Extreme_Poverty_Share, Mean_Income,
    Poorest_10_Threshold, Richest_10_Threshold,
    Fertility_Rate, Median_Age, Dependency_Ratio
  ) %>%
  na.omit() # Remove missing rows

# Train-Test Split (70/30)
set.seed(123) # for reproducibility
train_index <- createDataPartition(
  model_data$Child_Mortality_Rate,
  p = 0.7,
  list = FALSE # return row indices as a matrix rather than a list
)
train_data <- model_data[train_index, ]
test_data <- model_data[-train_index, ]

# Fit Base Model (Multiple Linear Regression): the outcome is regressed on
# every other column of train_data.
base_model <- lm(Child_Mortality_Rate ~ ., data = train_data)

# Predict on Test Set
predictions <- predict(base_model, newdata = test_data)

# Calculate R-squared on Test Set.
# This is the squared Pearson correlation between predicted and observed
# values, not 1 - SSE/SST.
r_squared <- cor(predictions, test_data$Child_Mortality_Rate)^2
cat("R-squared (Test Set):", round(r_squared, 3), "\n")

# Calculate RMSE (stored; not printed by this chunk)
rmse <- sqrt(mean((predictions - test_data$Child_Mortality_Rate)^2))

# Model coefficients summary (stored; not printed by this chunk)
model_coef <- summary(base_model)$coefficients
Conclusion
This analysis demonstrates how income, fertility, and poverty-related indicators strongly predict under-5 mortality across global regions, particularly Sub-Saharan Africa and South Asia.