An analysis of the Global Multidimensional Poverty Index (MPI) data for various countries and regions around the world.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(tidyr)
library(knitr)
library(scales)
library(sf)
## Linking to GEOS 3.11.2, GDAL 3.8.2, PROJ 9.3.1; sf_use_s2() is TRUE
library(cluster)
Load the dataset
data <- read.csv('hdx_hapi_poverty_rate_global.csv', stringsAsFactors = FALSE)
Clean the dataset
data_cleaned <- data[-1, ]
colnames(data_cleaned) <- c('location_code', 'has_hrp', 'in_gho', 'provider_admin1_name', 'admin1_code', 'admin1_name', 'mpi', 'headcount_ratio', 'intensity_of_deprivation', 'vulnerable_to_poverty', 'in_severe_poverty', 'reference_period_start', 'reference_period_end')
Convert numeric columns to appropriate data types
numeric_columns <- c('mpi', 'headcount_ratio', 'intensity_of_deprivation', 'vulnerable_to_poverty', 'in_severe_poverty')
data_cleaned[numeric_columns] <- lapply(data_cleaned[numeric_columns], as.numeric)
Convert date columns to Date format
data_cleaned$reference_period_start <- as.Date(data_cleaned$reference_period_start)
data_cleaned$reference_period_end <- as.Date(data_cleaned$reference_period_end)
summary_statistics <- summary(data_cleaned[numeric_columns])
kable(summary_statistics)
| mpi | headcount_ratio | intensity_of_deprivation | vulnerable_to_poverty | in_severe_poverty | |
|---|---|---|---|---|---|
| Min. :0.0000 | Min. : 0.000 | Min. : 0.00 | Min. : 0.000 | Min. : 0.0000 | |
| 1st Qu.:0.0244 | 1st Qu.: 6.168 | 1st Qu.:39.60 | 1st Qu.: 7.924 | 1st Qu.: 0.8403 | |
| Median :0.1114 | Median :25.771 | Median :44.26 | Median :15.203 | Median : 7.2929 | |
| Mean :0.1772 | Mean :34.124 | Mean :45.45 | Mean :15.284 | Mean :17.9962 | |
| 3rd Qu.:0.3021 | 3rd Qu.:60.373 | 3rd Qu.:50.65 | 3rd Qu.:21.586 | 3rd Qu.:30.3959 | |
| Max. :0.7384 | Max. :99.492 | Max. :75.89 | Max. :50.182 | Max. :94.1871 |
country_summary <- data_cleaned %>%
group_by(location_code) %>%
summarise(across(all_of(numeric_columns), mean, na.rm = TRUE))
## Warning: There was 1 warning in `summarise()`.
## ℹ In argument: `across(all_of(numeric_columns), mean, na.rm = TRUE)`.
## ℹ In group 1: `location_code = "AFG"`.
## Caused by warning:
## ! The `...` argument of `across()` is deprecated as of dplyr 1.1.0.
## Supply arguments directly to `.fns` through an anonymous function instead.
##
## # Previously
## across(a:b, mean, na.rm = TRUE)
##
## # Now
## across(a:b, \(x) mean(x, na.rm = TRUE))
top_10_mpi <- country_summary %>%
arrange(desc(mpi)) %>%
head(10)
kable(top_10_mpi)
| location_code | mpi | headcount_ratio | intensity_of_deprivation | vulnerable_to_poverty | in_severe_poverty |
|---|---|---|---|---|---|
| TCD | 0.6016106 | 91.17772 | 65.43287 | 6.021026 | 75.62268 |
| NER | 0.5840333 | 86.73578 | 66.26881 | 6.285494 | 73.88559 |
| CAF | 0.4812875 | 81.79876 | 57.81771 | 12.028612 | 56.96986 |
| BFA | 0.4523393 | 77.59735 | 57.07156 | 10.758389 | 54.13713 |
| MLI | 0.4380071 | 76.17523 | 56.35877 | 11.521079 | 54.06448 |
| MDG | 0.4156371 | 73.61946 | 55.69504 | 13.203784 | 51.15671 |
| BDI | 0.3993667 | 73.53615 | 53.53338 | 16.032307 | 44.61298 |
| GIN | 0.3915037 | 68.89243 | 55.52337 | 14.826467 | 46.19869 |
| COD | 0.3903879 | 73.83552 | 52.37478 | 16.288131 | 45.14210 |
| MOZ | 0.3785306 | 66.33908 | 54.61580 | 14.225853 | 44.76680 |
ggplot(top_10_mpi, aes(x = reorder(location_code, mpi), y = mpi)) +
geom_bar(stat = "identity", fill = "steelblue") +
coord_flip() +
labs(title = "Top 10 Countries with Highest MPI",
x = "Country Code",
y = "Multidimensional Poverty Index (MPI)") +
theme_minimal()
correlation_matrix <- cor(data_cleaned[numeric_columns], use = "complete.obs")
kable(correlation_matrix)
| mpi | headcount_ratio | intensity_of_deprivation | vulnerable_to_poverty | in_severe_poverty | |
|---|---|---|---|---|---|
| mpi | 1.0000000 | 0.9863885 | 0.8938514 | 0.1293102 | 0.9851414 |
| headcount_ratio | 0.9863885 | 1.0000000 | 0.8704221 | 0.2439347 | 0.9481129 |
| intensity_of_deprivation | 0.8938514 | 0.8704221 | 1.0000000 | 0.1601427 | 0.8860146 |
| vulnerable_to_poverty | 0.1293102 | 0.2439347 | 0.1601427 | 1.0000000 | -0.0077697 |
| in_severe_poverty | 0.9851414 | 0.9481129 | 0.8860146 | -0.0077697 | 1.0000000 |
ggplot(data = reshape2::melt(correlation_matrix), aes(x = Var1, y = Var2, fill = value)) +
geom_tile() +
geom_text(aes(label = round(value, 2)), color = "white", size = 3) +
scale_fill_gradient2(low = "blue", high = "red", mid = "white", midpoint = 0) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(title = "Correlation Heatmap of Poverty Indicators")
time_series_data <- data_cleaned %>%
group_by(reference_period_start) %>%
summarise(across(all_of(numeric_columns), mean, na.rm = TRUE))
ggplot(time_series_data, aes(x = reference_period_start, y = mpi)) +
geom_line() +
geom_point() +
labs(title = "Global MPI Trend Over Time",
x = "Year",
y = "Average Multidimensional Poverty Index (MPI)") +
theme_minimal()
regional_summary <- data_cleaned %>%
group_by(admin1_name) %>%
summarise(across(all_of(numeric_columns), mean, na.rm = TRUE)) %>%
arrange(desc(mpi))
top_20_regions <- head(regional_summary, 20)
kable(top_20_regions)
| admin1_name | mpi | headcount_ratio | intensity_of_deprivation | vulnerable_to_poverty | in_severe_poverty |
|---|---|---|---|---|---|
| Lac | 0.7065000 | 97.53510 | 72.42900 | 1.882833 | 89.57247 |
| Sila | 0.6988333 | 97.66173 | 71.55697 | 1.989833 | 91.64073 |
| Wadi Fira | 0.6970667 | 97.54923 | 71.45373 | 1.486067 | 92.58220 |
| Kanem | 0.6898000 | 98.49097 | 70.02897 | 1.197033 | 89.57427 |
| Salamat | 0.6888000 | 96.94567 | 71.00207 | 2.318067 | 86.88500 |
| Hadjer-Lamis | 0.6810333 | 97.21193 | 70.03547 | 2.024000 | 87.17007 |
| Batha | 0.6776333 | 96.22830 | 70.41970 | 2.549567 | 87.64880 |
| Tillaberi | 0.6738000 | 94.91290 | 70.86310 | 3.797850 | 84.53650 |
| Maradi | 0.6686500 | 95.32000 | 70.13090 | 2.834650 | 85.78585 |
| Tahoua | 0.6652000 | 95.09980 | 69.90930 | 3.682850 | 84.77675 |
| Ouaddai | 0.6634000 | 94.30803 | 70.32827 | 2.835467 | 86.95670 |
| Dosso | 0.6608000 | 95.26330 | 69.30980 | 2.964050 | 83.78345 |
| Ennedi Est | 0.6598000 | 98.12420 | 67.23630 | 1.340800 | 90.38860 |
| Zinder | 0.6569500 | 94.51395 | 69.47955 | 3.782800 | 84.22380 |
| Barh-El-Gazel | 0.6525000 | 96.80120 | 67.41683 | 2.260833 | 85.61973 |
| Chari-Baguirmi | 0.6507000 | 96.39190 | 67.50610 | 2.862033 | 82.71860 |
| Guera | 0.6451000 | 94.25793 | 68.38773 | 4.521300 | 81.49673 |
| Sahel | 0.6356500 | 92.65400 | 68.65210 | 3.987200 | 83.63785 |
| Diffa | 0.6079000 | 94.05595 | 64.57360 | 3.268150 | 80.85905 |
| Ennedi Ouest | 0.6024000 | 93.43470 | 64.46930 | 4.597100 | 80.63540 |
ggplot(top_20_regions, aes(x = reorder(admin1_name, mpi), y = mpi)) +
geom_bar(stat = "identity", fill = "darkgreen") +
coord_flip() +
labs(title = "Top 20 Regions with Highest MPI",
x = "Region",
y = "Multidimensional Poverty Index (MPI)") +
theme_minimal() +
theme(axis.text.y = element_text(size = 8))
regional_summary <- data_cleaned %>%
group_by(admin1_name) %>%
summarise(
mean_mpi = mean(mpi, na.rm = TRUE),
sd_mpi = sd(mpi, na.rm = TRUE),
min_mpi = min(mpi, na.rm = TRUE),
max_mpi = max(mpi, na.rm = TRUE)
)
anova_result <- aov(mpi ~ admin1_name, data = data_cleaned)
summary(anova_result)
## Df Sum Sq Mean Sq F value Pr(>F)
## admin1_name 1000 68.05 0.06805 5.099 <2e-16 ***
## Residuals 2164 28.88 0.01335
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
temporal_trends <- data_cleaned %>%
group_by(location_code, admin1_name) %>%
arrange(reference_period_start) %>%
summarise(
start_mpi = first(mpi),
end_mpi = last(mpi),
years = as.numeric(difftime(last(reference_period_end), first(reference_period_start), units = "days")) / 365,
avg_annual_change = (end_mpi - start_mpi) / years
)
## `summarise()` has grouped output by 'location_code'. You can override using the
## `.groups` argument.
ggplot(data_cleaned, aes(x = reference_period_start, y = mpi, color = admin1_name)) +
geom_line() +
theme_minimal() +
labs(title = "MPI Trends Over Time", x = "Year", y = "MPI", color = "Region")
indicators <- c("mpi", "headcount_ratio", "intensity_of_deprivation")
data_cleaned %>%
gather(key = "indicator", value = "value", indicators) %>%
group_by(admin1_name, indicator) %>%
summarise(mean_value = mean(value, na.rm = TRUE)) %>%
ggplot(aes(x = admin1_name, y = mean_value, fill = indicator)) +
geom_bar(stat = "identity", position = "stack") +
theme_minimal() +
labs(title = "MPI Composition by Region", x = "Region", y = "Contribution to MPI")
## Warning: Using an external vector in selections was deprecated in tidyselect 1.1.0.
## ℹ Please use `all_of()` or `any_of()` instead.
## # Was:
## data %>% select(indicators)
##
## # Now:
## data %>% select(all_of(indicators))
##
## See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `summarise()` has grouped output by 'admin1_name'. You can override using the
## `.groups` argument.
set.seed(123)
kmeans_result <- kmeans(data_cleaned[c("mpi", "headcount_ratio", "intensity_of_deprivation")], centers = 5)
data_cleaned$cluster <- kmeans_result$cluster
ggplot(data_cleaned, aes(x = headcount_ratio, y = intensity_of_deprivation, color = factor(cluster))) +
geom_point() +
theme_minimal() +
labs(title = "Cluster Analysis of Poverty Profiles", color = "Cluster")
gini_coefficients <- data_cleaned %>%
group_by(admin1_name) %>%
summarise(gini_mpi = ineq::Gini(mpi))
ggplot(data_cleaned, aes(x = cumsum(mpi) / sum(mpi), y = seq_along(mpi) / length(mpi))) +
geom_line() +
geom_abline(linetype = "dashed") +
theme_minimal() +
labs(title = "Lorenz Curve of MPI", x = "Cumulative Share of MPI", y = "Cumulative Share of Population")
This is an analysis of the Global Multidimensional Poverty Index data revealing variations in poverty levels across countries and regions, as well as over time..