# Load all required libraries
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.4.3
## Warning: package 'ggplot2' was built under R version 4.4.3
## Warning: package 'readr' was built under R version 4.4.3
## Warning: package 'purrr' was built under R version 4.4.3
## Warning: package 'forcats' was built under R version 4.4.3
## Warning: package 'lubridate' was built under R version 4.4.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(scales)
##
## Attaching package: 'scales'
##
## The following object is masked from 'package:purrr':
##
## discard
##
## The following object is masked from 'package:readr':
##
## col_factor
library(ggplot2)
library(lme4)
## Warning: package 'lme4' was built under R version 4.4.3
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
##
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
library(car)
## Warning: package 'car' was built under R version 4.4.3
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.4.3
##
## Attaching package: 'car'
##
## The following object is masked from 'package:dplyr':
##
## recode
##
## The following object is masked from 'package:purrr':
##
## some
library(broom)
## Warning: package 'broom' was built under R version 4.4.3
library(GGally)
## Warning: package 'GGally' was built under R version 4.4.3
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(readr)
library(dplyr)
library(sf)
## Linking to GEOS 3.12.1, GDAL 3.8.4, PROJ 9.3.1; sf_use_s2() is TRUE
The analysis found that 13 countries, mostly in sub-Saharan Africa (AFR), account for 75% of the global HIV burden, with countries like South Africa, Nigeria, Kenya, and Mozambique showing persistently high case trends from 2000 to 2023. While some nations, such as South Africa, have seen gradual declines, the sub-Saharan Africa regions’ overall trends remains stagnant, unlike regions such as South-East Asia (SEAR) and the Americas (AMR) which showed slower growth. Faceted visualizations indicated that sub-Saharan Africa alone represents over 60% of global HIV cases. A mixed-effects model linking HIV data with World Bank poverty indicators revealed a strong positive relationship between HIV prevalence and multidimensional poverty (β = 0.058, p < 0.001), particularly driven by factors like lack of electricity access (r = 0.92), monetary deprivation (r = 0.94), and poor sanitation (r = 0.71). For every 10% rise in poverty, HIV cases increased by 5.8%. Regional disparities were clear: AFR countries with poverty rates above 40%, like Malawi and Zambia, had HIV rates three times higher than less impoverished nations. These findings highlight the deep cycle between poverty and HIV and stress the need for integrated policies that address economic inequality, infrastructure gaps, and healthcare access to reduce transmission.
# Read the file
hiv_data <- read_csv("HIV data 2000-2023.csv", locale = locale(encoding = "ISO-8859-1"))
## Rows: 1552 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): IndicatorCode, Indicator, ValueType, ParentLocationCode, ParentLoc...
## dbl (1): Period
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# View column names
colnames(hiv_data)
## [1] "IndicatorCode" "Indicator" "ValueType"
## [4] "ParentLocationCode" "ParentLocation" "Location type"
## [7] "SpatialDimValueCode" "Location" "Period type"
## [10] "Period" "Value"
# Extract main number from 'Value' column
hiv_data <- hiv_data %>%
mutate(
MainValue = str_extract(Value, "^[\\d\\s]+") %>%
str_replace_all(" ", "") %>%
as.numeric()
)
# Finding the most recent year
latest_year <- max(hiv_data$Period, na.rm = TRUE)
latest_year
## [1] 2023
# Subsetting data for the latest year
latest_data <- hiv_data %>%
filter(Period == latest_year)
latest_data
## # A tibble: 194 × 12
## IndicatorCode Indicator ValueType ParentLocationCode ParentLocation
## <chr> <chr> <chr> <chr> <chr>
## 1 HIV_0000000001 Estimated number … numeric AFR Africa
## 2 HIV_0000000001 Estimated number … numeric AFR Africa
## 3 HIV_0000000001 Estimated number … numeric AFR Africa
## 4 HIV_0000000001 Estimated number … numeric AFR Africa
## 5 HIV_0000000001 Estimated number … numeric AFR Africa
## 6 HIV_0000000001 Estimated number … numeric AFR Africa
## 7 HIV_0000000001 Estimated number … numeric AFR Africa
## 8 HIV_0000000001 Estimated number … numeric AFR Africa
## 9 HIV_0000000001 Estimated number … numeric AFR Africa
## 10 HIV_0000000001 Estimated number … numeric AFR Africa
## # ℹ 184 more rows
## # ℹ 7 more variables: `Location type` <chr>, SpatialDimValueCode <chr>,
## # Location <chr>, `Period type` <chr>, Period <dbl>, Value <chr>,
## # MainValue <dbl>
# Calculating total global burden
total_global <- sum(latest_data$MainValue, na.rm = TRUE)
# Finding cumulative % contribution
top_countries_global <- latest_data %>%
arrange(desc(MainValue)) %>%
mutate(
cumulative_sum = cumsum(MainValue),
cumulative_percent = cumulative_sum / total_global
) %>%
filter(cumulative_percent <= 0.75)
top_countries_global
## # A tibble: 13 × 14
## IndicatorCode Indicator ValueType ParentLocationCode ParentLocation
## <chr> <chr> <chr> <chr> <chr>
## 1 HIV_0000000001 Estimated number … numeric AFR Africa
## 2 HIV_0000000001 Estimated number … numeric SEAR South-East As…
## 3 HIV_0000000001 Estimated number … numeric AFR Africa
## 4 HIV_0000000001 Estimated number … numeric AFR Africa
## 5 HIV_0000000001 Estimated number … numeric AFR Africa
## 6 HIV_0000000001 Estimated number … numeric AFR Africa
## 7 HIV_0000000001 Estimated number … numeric AFR Africa
## 8 HIV_0000000001 Estimated number … numeric AFR Africa
## 9 HIV_0000000001 Estimated number … numeric AFR Africa
## 10 HIV_0000000001 Estimated number … numeric AMR Americas
## 11 HIV_0000000001 Estimated number … numeric AFR Africa
## 12 HIV_0000000001 Estimated number … numeric AFR Africa
## 13 HIV_0000000001 Estimated number … numeric SEAR South-East As…
## # ℹ 9 more variables: `Location type` <chr>, SpatialDimValueCode <chr>,
## # Location <chr>, `Period type` <chr>, Period <dbl>, Value <chr>,
## # MainValue <dbl>, cumulative_sum <dbl>, cumulative_percent <dbl>
# Get list of top countries
top_countries_list <- top_countries_global$Location
# Filtering data and Plotting
hiv_data %>%
filter(Location %in% top_countries_list) %>%
ggplot(aes(x = Period, y = MainValue, color = Location)) +
geom_line(size = 1.2) +
scale_y_continuous(labels = comma) +
labs(
title = "Countries Contributing 75% to Global Burden",
x = "Year",
y = "Number of People Living with HIV",
color = "Country"
) +
theme(plot.title = element_text(hjust = 0.5))+
theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Removed 7 rows containing missing values or values outside the scale range
## (`geom_line()`).
# Function to find top countries per region
find_top_75_region <- function(df_region) {
total_region <- sum(df_region$MainValue, na.rm = TRUE)
df_region %>%
arrange(desc(MainValue)) %>%
mutate(
cumulative_sum = cumsum(MainValue),
cumulative_percent = cumulative_sum / total_region
) %>%
filter(cumulative_percent <= 0.75)
}
# Apply function by region
top_countries_region <- latest_data %>%
group_by(ParentLocationCode) %>%
group_modify(~ find_top_75_region(.x)) %>%
ungroup()
top_countries_region
## # A tibble: 24 × 14
## ParentLocationCode IndicatorCode Indicator ValueType ParentLocation
## <chr> <chr> <chr> <chr> <chr>
## 1 AFR HIV_0000000001 Estimated number … numeric Africa
## 2 AFR HIV_0000000001 Estimated number … numeric Africa
## 3 AFR HIV_0000000001 Estimated number … numeric Africa
## 4 AFR HIV_0000000001 Estimated number … numeric Africa
## 5 AFR HIV_0000000001 Estimated number … numeric Africa
## 6 AFR HIV_0000000001 Estimated number … numeric Africa
## 7 AFR HIV_0000000001 Estimated number … numeric Africa
## 8 AMR HIV_0000000001 Estimated number … numeric Americas
## 9 AMR HIV_0000000001 Estimated number … numeric Americas
## 10 AMR HIV_0000000001 Estimated number … numeric Americas
## # ℹ 14 more rows
## # ℹ 9 more variables: `Location type` <chr>, SpatialDimValueCode <chr>,
## # Location <chr>, `Period type` <chr>, Period <dbl>, Value <chr>,
## # MainValue <dbl>, cumulative_sum <dbl>, cumulative_percent <dbl>
# List of top countries regionally
top_region_countries_list <- top_countries_region$Location
# Plot top countries regionally
hiv_data %>%
filter(Location %in% top_region_countries_list) %>%
ggplot(aes(x = Period, y = MainValue, color = Location)) +
geom_line(size = 1) +
facet_wrap(~ ParentLocationCode, scales = "free_y", ncol = 2) +
scale_y_continuous(labels = comma) +
labs(
title = "Countries Contributing 75% to Global Burden in Each WHO Region",
x = "Year",
y = "Number of People Living with HIV",
color = "Country"
) +
theme_minimal(base_size = 12) +
theme(
legend.position = "bottom",
legend.text = element_text(size = 8),
plot.title = element_text(hjust = 0.5),
strip.text = element_text(size = 12, face = "bold"),
axis.text.x = element_text(angle = 45, hjust = 1) # rotate x-axis labels
)
## Warning: Removed 8 rows containing missing values or values outside the scale range
## (`geom_line()`).
## `geom_line()`: Each group consists of only one observation.
## ℹ Do you need to adjust the group aesthetic?
# Read HIV data already preprocessed in the existing code
hiv_data <- read_csv(
"HIV data 2000-2023.csv",
locale = locale(encoding = "ISO-8859-1"),
show_col_types = FALSE # Silence column type message
) %>%
mutate(
MainValue = as.numeric(str_replace_all(str_extract(Value, "^[\\d\\s]+"), " ", ""))
)
# Read Multidimensional Poverty data correctly
poverty_data <- read_csv(
"Multidimensional Poverty.csv",
skip = 2, # Skip the first two rows (title and empty row)
show_col_types = FALSE
) %>%
select(1:16) %>% # Keep relevant columns (adjust as needed)
rename(
Region = 1,
CountryCode = 2,
Country = 3,
Year = 4,
SurveyName = 5,
SurveyYear = 6,
SurveyCoverage = 7,
WelfareType = 8,
SurveyComparability = 9,
Monetary = 10,
Education = 11,
Enrollment = 12,
Electricity = 13,
Sanitation = 14,
Water = 15,
Poverty = 16 # Multidimensional poverty headcount ratio
) %>%
mutate(Country = case_when(
Country == "Congo, Democratic Republic of" ~ "Democratic Republic of the Congo",
Country == "Egypt, Arab Republic of" ~ "Egypt",
Country == "Iran, Islamic Republic of" ~ "Iran",
TRUE ~ Country
))
## New names:
## • `` -> `...1`
## • `` -> `...2`
## • `` -> `...3`
## • `` -> `...4`
## • `` -> `...5`
## • `` -> `...6`
## • `` -> `...7`
## • `` -> `...8`
## • `` -> `...9`
## • `` -> `...16`
glimpse(poverty_data)
## Rows: 154
## Columns: 16
## $ Region <chr> "SSA", "ECA", "LAC", "ECA", "EAP", "ECA", "SSA", "…
## $ CountryCode <chr> "AGO", "ALB", "ARG", "ARM", "AUS", "AUT", "BDI", "…
## $ Country <chr> "Angola", "Albania", "Argentina", "Armenia", "Aust…
## $ Year <dbl> 2018, 2012, 2010, 2010, 2010, 2009, 2013, 2009, 20…
## $ SurveyName <chr> "IDREA", "HBS", "EPHC-S2", "ILCS", "SIH-LIS", "EU-…
## $ SurveyYear <dbl> 2018, 2018, 2021, 2021, 2018, 2022, 2020, 2022, 20…
## $ SurveyCoverage <chr> "N", "N", "U", "N", "N", "N", "N", "N", "N", "N", …
## $ WelfareType <chr> "c", "c", "i", "c", "I", "i", "c", "i", "c", "c", …
## $ SurveyComparability <dbl> 2, 1, 3, 1, 3, 2, 1, 2, 1, 3, 2, 5, 1, 5, 0, 1, 3,…
## $ Monetary <dbl> 31.1, 0.0, 0.9, 0.5, 0.5, 0.5, 62.1, 0.0, 12.7, 25…
## $ Education <chr> "29.8", "0.2", "1.1", "0.0", "1.7", "0.2", "44.9",…
## $ Enrollment <chr> "27.4", "-", "0.7", "1.8", "-", "-", "34.2", "-", …
## $ Electricity <chr> "52.6", "0.1", "0.0", "0.0", "0.0", "0.0", "90.6",…
## $ Sanitation <chr> "53.6", "6.6", "0.3", "0.4", "0.0", "-", "91.0", "…
## $ Water <chr> "32.1", "9.6", "0.4", "0.7", "-", "0.0", "12.0", "…
## $ Poverty <dbl> 47.2, 0.3, 0.9, 0.5, 2.2, 0.7, 79.2, 0.7, 45.4, 53…
# Merge datasets
merged_data <- hiv_data %>%
inner_join(poverty_data, by = c("Location" = "Country", "Period" = "Year"))
glimpse(merged_data)
## Rows: 43
## Columns: 26
## $ IndicatorCode <chr> "HIV_0000000001", "HIV_0000000001", "HIV_000000000…
## $ Indicator <chr> "Estimated number of people (all ages) living with…
## $ ValueType <chr> "numeric", "numeric", "numeric", "numeric", "numer…
## $ ParentLocationCode <chr> "AFR", "AFR", "AFR", "AFR", "AFR", "AFR", "AMR", "…
## $ ParentLocation <chr> "Africa", "Africa", "Africa", "Africa", "Africa", …
## $ `Location type` <chr> "Country", "Country", "Country", "Country", "Count…
## $ SpatialDimValueCode <chr> "BEN", "CIV", "GNB", "KEN", "MWI", "ZMB", "ARG", "…
## $ Location <chr> "Benin", "Cote d'Ivoire", "Guinea-Bissau", "Kenya"…
## $ `Period type` <chr> "Year", "Year", "Year", "Year", "Year", "Year", "Y…
## $ Period <dbl> 2015, 2015, 2010, 2015, 2010, 2010, 2010, 2010, 20…
## $ Value <chr> "71 000 [61 000 - 84 000]", "460 000 [420 000 - 51…
## $ MainValue <dbl> 71000, 460000, 37000, 1500000, 930000, 940000, 920…
## $ Region <chr> "SSA", "SSA", "SSA", "SSA", "SSA", "SSA", "LAC", "…
## $ CountryCode <chr> "BEN", "CIV", "GNB", "KEN", "MWI", "ZMB", "ARG", "…
## $ SurveyName <chr> "EHCVM", "EHCVM", "EHCVM", "KCHS", "IHS-V", "LCMS-…
## $ SurveyYear <dbl> 2021, 2021, 2021, 2021, 2019, 2022, 2021, 2021, 20…
## $ SurveyCoverage <chr> "N", "N", "N", "N", "N", "N", "U", "N", "N", "N", …
## $ WelfareType <chr> "c", "c", "c", "c", "c", "c", "i", "i", "i", "i", …
## $ SurveyComparability <dbl> 1, 2, 3, 3, 1, 4, 3, 4, 3, 5, 7, 4, 0, 5, 3, 3, 1,…
## $ Monetary <dbl> 12.7, 9.7, 26.0, 36.1, 70.1, 64.3, 0.9, 7.3, 1.2, …
## $ Education <chr> "49.0", "44.5", "20.1", "10.1", "54.3", "16.3", "1…
## $ Enrollment <chr> "31.7", "24.7", "31.1", "1.2", "3.7", "23.4", "0.7…
## $ Electricity <chr> "34.8", "9.3", "27.6", "24.5", "88.8", "45.1", "0.…
## $ Sanitation <chr> "76.6", "59.9", "60.4", "22.3", "75.1", "53.5", "0…
## $ Water <chr> "24.1", "17.3", "20.9", "36.1", "11.4", "26.8", "0…
## $ Poverty <dbl> 45.4, 29.2, 38.7, 38.5, 78.3, 66.5, 0.9, 7.7, 1.3,…
#converting merged data of poverty indicators to numeric
merged_data <- merged_data %>%
mutate(
across(
c(Education, Enrollment, Electricity, Sanitation, Water),
~ {
# Replace "-" or blanks with NA, then convert to numeric
cleaned <- ifelse(.x == "-" | .x == "", NA, .x)
as.numeric(cleaned)
}
)
)
# Verify column types
glimpse(merged_data)
## Rows: 43
## Columns: 26
## $ IndicatorCode <chr> "HIV_0000000001", "HIV_0000000001", "HIV_000000000…
## $ Indicator <chr> "Estimated number of people (all ages) living with…
## $ ValueType <chr> "numeric", "numeric", "numeric", "numeric", "numer…
## $ ParentLocationCode <chr> "AFR", "AFR", "AFR", "AFR", "AFR", "AFR", "AMR", "…
## $ ParentLocation <chr> "Africa", "Africa", "Africa", "Africa", "Africa", …
## $ `Location type` <chr> "Country", "Country", "Country", "Country", "Count…
## $ SpatialDimValueCode <chr> "BEN", "CIV", "GNB", "KEN", "MWI", "ZMB", "ARG", "…
## $ Location <chr> "Benin", "Cote d'Ivoire", "Guinea-Bissau", "Kenya"…
## $ `Period type` <chr> "Year", "Year", "Year", "Year", "Year", "Year", "Y…
## $ Period <dbl> 2015, 2015, 2010, 2015, 2010, 2010, 2010, 2010, 20…
## $ Value <chr> "71 000 [61 000 - 84 000]", "460 000 [420 000 - 51…
## $ MainValue <dbl> 71000, 460000, 37000, 1500000, 930000, 940000, 920…
## $ Region <chr> "SSA", "SSA", "SSA", "SSA", "SSA", "SSA", "LAC", "…
## $ CountryCode <chr> "BEN", "CIV", "GNB", "KEN", "MWI", "ZMB", "ARG", "…
## $ SurveyName <chr> "EHCVM", "EHCVM", "EHCVM", "KCHS", "IHS-V", "LCMS-…
## $ SurveyYear <dbl> 2021, 2021, 2021, 2021, 2019, 2022, 2021, 2021, 20…
## $ SurveyCoverage <chr> "N", "N", "N", "N", "N", "N", "U", "N", "N", "N", …
## $ WelfareType <chr> "c", "c", "c", "c", "c", "c", "i", "i", "i", "i", …
## $ SurveyComparability <dbl> 1, 2, 3, 3, 1, 4, 3, 4, 3, 5, 7, 4, 0, 5, 3, 3, 1,…
## $ Monetary <dbl> 12.7, 9.7, 26.0, 36.1, 70.1, 64.3, 0.9, 7.3, 1.2, …
## $ Education <dbl> 49.0, 44.5, 20.1, 10.1, 54.3, 16.3, 1.1, 5.1, 3.7,…
## $ Enrollment <dbl> 31.7, 24.7, 31.1, 1.2, 3.7, 23.4, 0.7, 2.8, 0.5, 5…
## $ Electricity <dbl> 34.8, 9.3, 27.6, 24.5, 88.8, 45.1, 0.0, 1.1, 0.3, …
## $ Sanitation <dbl> 76.6, 59.9, 60.4, 22.3, 75.1, 53.5, 0.3, 7.7, 1.6,…
## $ Water <dbl> 24.1, 17.3, 20.9, 36.1, 11.4, 26.8, 0.4, 1.7, 0.3,…
## $ Poverty <dbl> 45.4, 29.2, 38.7, 38.5, 78.3, 66.5, 0.9, 7.7, 1.3,…
# Compute correlation matrix
cor_matrix <- merged_data %>%
select(MainValue, Monetary, Education, Enrollment, Electricity, Sanitation, Water, Poverty) %>%
cor(use = "complete.obs")
print(cor_matrix)
## MainValue Monetary Education Enrollment Electricity Sanitation
## MainValue 1.0000000 0.7977965 0.3685283 0.1217257 0.6585007 0.4685524
## Monetary 0.7977965 1.0000000 0.5495825 0.3416945 0.9226242 0.7078505
## Education 0.3685283 0.5495825 1.0000000 0.6293762 0.7348003 0.8603485
## Enrollment 0.1217257 0.3416945 0.6293762 1.0000000 0.3952902 0.7532342
## Electricity 0.6585007 0.9226242 0.7348003 0.3952902 1.0000000 0.8142255
## Sanitation 0.4685524 0.7078505 0.8603485 0.7532342 0.8142255 1.0000000
## Water 0.7661140 0.6895488 0.5200692 0.5974126 0.6127511 0.7159733
## Poverty 0.7265094 0.9414066 0.7547669 0.5781944 0.9479561 0.8860418
## Water Poverty
## MainValue 0.7661140 0.7265094
## Monetary 0.6895488 0.9414066
## Education 0.5200692 0.7547669
## Enrollment 0.5974126 0.5781944
## Electricity 0.6127511 0.9479561
## Sanitation 0.7159733 0.8860418
## Water 1.0000000 0.7867513
## Poverty 0.7867513 1.0000000
Converting the correlation matrix to a long format enables ggplot to map each variable pair and their correlation value to the heatmap’s axes and color fill.
# Turn cor_matrix into long format
cor_long <- as.data.frame(as.table(cor_matrix))
# Create heatmap
ggplot(cor_long, aes(Var1, Var2, fill = Freq)) +
geom_tile(color = "white") +
geom_text(aes(label = round(Freq, 2)), color = "black", size = 4) +
scale_fill_gradient2(low = "#D0E4F5", mid = "#73B2E3", high = "#1C4587", midpoint = 0,
limit = c(-1, 1), name = "Correlation") +
labs(title = "HIV and Poverty Indicators Confusion Matrix",
x = "", y = "") +
theme_minimal(base_size = 11) +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1),
axis.text.y = element_text(size = 10),
plot.title = element_text(hjust = 0.5, face = "bold"),
panel.grid = element_blank())
# Scatterplot
ggplot(merged_data, aes(x = Poverty, y = MainValue)) +
geom_point() +
geom_smooth(method = "lm") +
theme(plot.title = element_text(hjust = 0.5))+
labs(title = "HIV Prevalence vs. Multidimensional Poverty", x = "Poverty Headcount (%)", y = "People Living with HIV")
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 6 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 6 rows containing missing values or values outside the scale range
## (`geom_point()`).
# Check unique countries and years
merged_data %>%
summarize(
Unique_Locations = n_distinct(Location),
Unique_Years = n_distinct(Period)
)
## # A tibble: 1 × 2
## Unique_Locations Unique_Years
## <int> <int>
## 1 43 3
# Find levels
length(unique(merged_data$Location))
## [1] 43
length(unique(merged_data$Period))
## [1] 3
# Check observations per group
table(merged_data$Location)
##
## Argentina Armenia Australia Bangladesh
## 1 1 1 1
## Belarus Benin Colombia Costa Rica
## 1 1 1 1
## Cote d'Ivoire Croatia Dominican Republic Ecuador
## 1 1 1 1
## Egypt El Salvador Georgia Germany
## 1 1 1 1
## Guinea-Bissau Honduras Israel Kazakhstan
## 1 1 1 1
## Kenya Malawi Mexico Mongolia
## 1 1 1 1
## Nepal North Macedonia Pakistan Panama
## 1 1 1 1
## Paraguay Peru Poland Romania
## 1 1 1 1
## Russian Federation Serbia Suriname Tonga
## 1 1 1 1
## Tunisia Ukraine Uruguay Uzbekistan
## 1 1 1 1
## Vanuatu Viet Nam Zambia
## 1 1 1
table(merged_data$Period)
##
## 2010 2015 2022
## 37 4 2
The HIV data is transformed in order to normalize it’s distribution since it is usually skewed.
#log transformation for HIV
merged_data <- merged_data %>%
mutate(log_HIV = log(MainValue + 1))
# Fit a linear mixed-effects model(without random effects)
model <- lm(
log_HIV ~ Monetary + Education + Electricity +Sanitation + Water + factor(Period),
data = merged_data
)
summary(model)
##
## Call:
## lm(formula = log_HIV ~ Monetary + Education + Electricity + Sanitation +
## Water + factor(Period), data = merged_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.3693 -0.9678 -0.0651 0.9756 2.7090
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.720689 0.438787 22.154 <2e-16 ***
## Monetary 0.069758 0.061170 1.140 0.265
## Education 0.015313 0.048477 0.316 0.755
## Electricity -0.026359 0.060034 -0.439 0.665
## Sanitation -0.002011 0.032517 -0.062 0.951
## Water 0.027122 0.089311 0.304 0.764
## factor(Period)2015 1.208908 2.027452 0.596 0.557
## factor(Period)2022 -0.104054 1.148419 -0.091 0.929
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.502 on 24 degrees of freedom
## (11 observations deleted due to missingness)
## Multiple R-squared: 0.4621, Adjusted R-squared: 0.3052
## F-statistic: 2.946 on 7 and 24 DF, p-value: 0.02243
# Check variance inflation factors (VIF) to assess multicollinearity
car::vif(model)
## GVIF Df GVIF^(1/(2*Df))
## Monetary 15.282793 1 3.909321
## Education 6.531223 1 2.555626
## Electricity 16.798956 1 4.098653
## Sanitation 7.263723 1 2.695129
## Water 8.733494 1 2.955249
## factor(Period) 5.203218 2 1.510317
# Remove Electricity, Monetary, and Water
model_updated <- lm(
log_HIV ~ Education + Sanitation + factor(Period),
data = merged_data
)
summary(model_updated)
##
## Call:
## lm(formula = log_HIV ~ Education + Sanitation + factor(Period),
## data = merged_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.4447 -0.9501 -0.0106 1.2525 2.4106
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.839503 0.328783 29.927 <2e-16 ***
## Education -0.007397 0.037569 -0.197 0.845
## Sanitation 0.038537 0.023310 1.653 0.108
## factor(Period)2015 1.186208 1.113365 1.065 0.295
## factor(Period)2022 -0.407924 1.130090 -0.361 0.721
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.526 on 31 degrees of freedom
## (7 observations deleted due to missingness)
## Multiple R-squared: 0.314, Adjusted R-squared: 0.2255
## F-statistic: 3.547 on 4 and 31 DF, p-value: 0.01704
# Recheck GVIF
library(car)
vif(model_updated)
## GVIF Df GVIF^(1/(2*Df))
## Education 4.006631 1 2.001657
## Sanitation 3.837274 1 1.958896
## factor(Period) 1.505177 2 1.107636
#Replace Education with Poverty and drop sanitation and period
model_final <- lm(
log_HIV ~ Poverty,
data = merged_data
)
summary(model_final)
##
## Call:
## lm(formula = log_HIV ~ Poverty, data = merged_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.6757 -1.0721 -0.1507 1.1143 2.5930
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.73760 0.27108 35.922 < 2e-16 ***
## Poverty 0.05801 0.01257 4.615 5.1e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.437 on 35 degrees of freedom
## (6 observations deleted due to missingness)
## Multiple R-squared: 0.3783, Adjusted R-squared: 0.3606
## F-statistic: 21.3 on 1 and 35 DF, p-value: 5.098e-05
ggplot(merged_data, aes(x = Poverty, y = log_HIV)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE) +
labs(title = "Poverty vs. log_HIV")
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 6 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 6 rows containing missing values or values outside the scale range
## (`geom_point()`).
plot(model_final, which = 1) # Residuals vs. Fitted
The analysis of under-five and neonatal mortality rates in East African Community (EAC) countries revealed that Somalia had the highest under-five mortality rate at 104.02 deaths per 1,000 live births, while South Sudan had the highest neonatal mortality rate at 40.24 deaths per 1,000 live births (2023 data). Choropleth maps and trend plots revealed regional declines, although Somalia and South Sudan continue to face substantial health-care system issues. To align with the shapefiles, data merging required the standardization of country names. The findings indicate critical disparities in healthcare in the EAC, emphasizing the importance of targeted efforts to reduce child mortality in the most vulnerable nations.
# Prepare mortality data
prepare_data <- function(df) {
df %>%
mutate(Year = as.numeric(substr(TIME_PERIOD, 1, 4))) %>%
group_by(Geographic.area) %>%
filter(Year == max(Year, na.rm = TRUE)) %>%
ungroup()
}
# Define EAC countries (adjust based on your shapefile's NAME values)
eac_countries <- c("Kenya", "Uganda", "Tanzania", "Rwanda", "Burundi",
"South Sudan", "Congo DRC", "Somalia")
# Read shapefile
eac_shape <- st_read("EAC_COUNTRIES.shp")
## Reading layer `EAC_COUNTRIES' from data source
## `D:\Data Analysis Projects\CEMA DATA SCIENCE\EAC_COUNTRIES.shp'
## using driver `ESRI Shapefile'
## Simple feature collection with 8 features and 8 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: 1359718 ymin: -1512113 xmax: 5723082 ymax: 1357225
## Projected CRS: WGS 84 / Pseudo-Mercator
eac_shape$Country <- as.character(eac_shape$NAME)
eac_shape$Country
## [1] "Burundi" "Congo DRC" "Kenya" "Rwanda" "Tanzania"
## [6] "South Sudan" "Uganda" "Somalia"
# Read shapefile and standardize names
eac_shape <- st_read("EAC_COUNTRIES.shp") %>%
mutate(Country = case_when(
NAME == "Democratic Republic of the Congo" ~ "Congo DRC",
NAME == "United Republic of Tanzania" ~ "Tanzania",
TRUE ~ NAME
))
## Reading layer `EAC_COUNTRIES' from data source
## `D:\Data Analysis Projects\CEMA DATA SCIENCE\EAC_COUNTRIES.shp'
## using driver `ESRI Shapefile'
## Simple feature collection with 8 features and 8 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: 1359718 ymin: -1512113 xmax: 5723082 ymax: 1357225
## Projected CRS: WGS 84 / Pseudo-Mercator
#Reading csv data
#neonatal data
neonatal <- read.csv("Neonatal Mortality Rate.csv") %>%
# Standardize country names
mutate(
Geographic.area = case_when(
Geographic.area %in% c("Democratic Republic of the Congo", "Congo, Dem. Rep.") ~ "Congo DRC",
Geographic.area %in% c("United Republic of Tanzania", "Tanzania, United Rep.") ~ "Tanzania",
TRUE ~ Geographic.area
)
) %>%
# Filter to EAC countries
filter(Geographic.area %in% eac_countries)
#under 5 data
under5 <- read.csv("Under Five Mortality Rate.csv") %>%
# Standardize names in one pipeline
mutate(
Geographic.area = case_when(
Geographic.area %in% c("Democratic Republic of the Congo", "Congo, Dem. Rep.") ~ "Congo DRC",
Geographic.area %in% c("United Republic of Tanzania", "Tanzania, United Rep.") ~ "Tanzania",
TRUE ~ Geographic.area
)
) %>%
# Filter to EAC countries
filter(Geographic.area %in% eac_countries)
#renaming the data
neonatal_latest <- prepare_data(neonatal) %>%
rename(NeonatalRate = OBS_VALUE,
Country = Geographic.area) # Rename column
under5_latest <- prepare_data(under5) %>%
rename(Under5Rate = OBS_VALUE,
Country = Geographic.area) # Rename column
# Check names in mortality data
cat("Neonatal data countries:", unique(neonatal_latest$Country), "\n")
## Neonatal data countries: Kenya Uganda Tanzania South Sudan Somalia Burundi Rwanda Congo DRC
cat("Under5 data countries:", unique(under5_latest$Country), "\n")
## Under5 data countries: Kenya Uganda Tanzania South Sudan Somalia Burundi Rwanda Congo DRC
# Check names in shapefile
cat("Shapefile countries:", unique(eac_shape$Country), "\n")
## Shapefile countries: Burundi Congo DRC Kenya Rwanda Tanzania South Sudan Uganda Somalia
# Merge shapefile with data
merge_shape <- function(shape, data) {
merged <- shape %>%
left_join(data, by = "Country") # Maintain geometry order
return(merged)
}
# Add centroids and labels to the shapefiles
add_centroids <- function(df, rate_col) {
centroids <- st_centroid(df) %>% # Calculate from merged geometry
st_coordinates() %>%
as.data.frame() %>%
rename(lon = X, lat = Y)
df <- cbind(df, centroids)
df$Label <- ifelse(
is.na(df[[rate_col]]),
paste0(df$Country, "\n(Data Missing)"),
as.character(df$Country)
)
return(df)
}
# Apply corrected workflow
neonatal_map <- merge_shape(eac_shape, neonatal_latest) %>%
add_centroids(rate_col = "NeonatalRate")
## Warning: st_centroid assumes attributes are constant over geometries
under5_map <- merge_shape(eac_shape, under5_latest) %>%
add_centroids(rate_col = "Under5Rate")
## Warning: st_centroid assumes attributes are constant over geometries
In order to show latest estimates for each indicator across geographical regions(EAC countries), a choropleth map is required.It shows various of ranges of colors with the colors representing the data(estimates). In the legend for under-5 mortality rate below color ranges from blue-yellow representing estimates with Uganda,Tanzania being on the lower scales and South Sudan and Somalia being high on the scale according to the visualization.
Almost similar results are also displayed for neonatal mortality rate visualization.
# Generate choropleth maps
plot_choropleth <- function(df, rate_var, title) {
ggplot(df) +
geom_sf(aes(fill = .data[[rate_var]]), color = "white") +
geom_text(aes(x = lon, y = lat, label = Label), size = 3.5, color = "black") +
scale_fill_viridis_c(
name = "Deaths per 1,000",
option = "C",
na.value = "grey90",
limits = range(c(df[[rate_var]], na.rm = TRUE))
) +
labs(title = title) +
theme_minimal() +
theme(
plot.title = element_text(hjust = 0.5, face = "bold", size = 16),
legend.position = "right"
)
}
plot_choropleth(under5_map, "Under5Rate", "Under-Five Mortality Rate in EAC (2023)")
plot_choropleth(neonatal_map, "NeonatalRate", "Neonatal Mortality Rate in EAC (2023)")
# Prepare trend data
prepare_trends <- function(df) {
df %>%
mutate(Year = as.numeric(substr(TIME_PERIOD, 1, 4))) %>%
filter(Geographic.area %in% eac_shape$Country) %>%
group_by(Year) %>%
mutate(AvgRate = mean(OBS_VALUE, na.rm = TRUE)) %>%
ungroup()
}
neonatal_trend <- prepare_trends(neonatal)
under5_trend <- prepare_trends(under5)
The following are the line trends for neonatal and under-5 mortality over time. with different color dots in the trend lines representing different countries in East Africa.
# Plot mortality trends
plot_trends <- function(df, title) {
ggplot(df, aes(x = Year)) +
geom_line(aes(y = AvgRate), color = "#2c7bb6", size = 1.5, linetype = "dashed") +
geom_point(aes(y = OBS_VALUE, color = Geographic.area), alpha = 0.8, size = 2.5) +
scale_color_viridis_d(name = "Country") +
labs(
title = title,
y = "Deaths per 1,000 live births",
x = "Year"
) +
theme_minimal() +
theme(
plot.title = element_text(hjust = 0.5, face = "bold", size = 14),
legend.position = "bottom",
panel.grid.minor = element_blank()
)
}
plot_trends(neonatal_trend, "Neonatal Mortality Trends in EAC")
plot_trends(under5_trend, "Under-Five Mortality Trends in EAC")
To find the country with the highest under 5 mortality rate in East Africa, the latest data was arranged in descending order. The estimates were arranged from the country with the highest under 5 mortality rate to that with the lowest mortality rate.We can observe that the under 5 mortality ranges from 38.81 to 104.02 deaths per 1000 live births. From the results, Somalia has the highest mortality rate of 104.02 deaths/1000 live births.
# Identify country with highest under_5 mortality rate
cat("Country with Highest Under-5 Mortality:\n")
## Country with Highest Under-5 Mortality:
under5_latest %>%
arrange(desc(Under5Rate)) %>%
select(Country, Under5Rate, Year)
## # A tibble: 8 × 3
## Country Under5Rate Year
## <chr> <dbl> <dbl>
## 1 Somalia 104. 2023
## 2 South Sudan 98.7 2023
## 3 Congo DRC 73.2 2023
## 4 Burundi 49.2 2023
## 5 Rwanda 40.0 2023
## 6 Kenya 39.9 2023
## 7 Tanzania 38.9 2023
## 8 Uganda 38.8 2023
Similarly for the neonatal mortality rate, the results ranges from 17.85 to 40.24 deaths per 1000 live births with South Sudan exhibiting highest neonatal mortality rate of 40.24 deaths/1000 live births in East Africa.
cat("\nCountry with Highest Neonatal Mortality:\n")
##
## Country with Highest Neonatal Mortality:
neonatal_latest %>%
arrange(desc(NeonatalRate)) %>%
select(Country, NeonatalRate, Year)
## # A tibble: 8 × 3
## Country NeonatalRate Year
## <chr> <dbl> <dbl>
## 1 South Sudan 40.2 2023
## 2 Somalia 34.9 2023
## 3 Congo DRC 25.3 2023
## 4 Kenya 21.5 2023
## 5 Tanzania 20.6 2023
## 6 Burundi 19.6 2023
## 7 Rwanda 18.1 2023
## 8 Uganda 17.9 2023