library(tidyverse)
library(ggplot2)
library(lubridate) #dealing with dates
library(broom)
library(dplyr)
library(stringr)
library(scales)
library(patchwork)
library(tseries)
library(forecast)
library(kableExtra)
library(readxl)
library(purrr)
library(stringi)
library(imputeTS)
library(data.table)
library(writexl)
library(sf)
library(gstat)
library(corrr)
library(lubridate)
library(car)
library(grid)
library(terra)Climatic based Malaria Incidence prediction in Malawi
Project Description
Malaria remains one of the most persistent public health challenges in Malawi. Every year, thousands of people, especially children and pregnant women are at risk of this deadly disease. But what if we could predict malaria outbreaks before they happen? That’s exactly what our latest research aims to do, using the power of climate data and predictive modeling.
Data Cleaning & Analysis
Importing Libraries
Loading libraries into our work space which will be used for analysis purposes
Loading in Malaria Data sets and merging the files
files <- list.files(pattern="*.xls")
merged_df <- files %>%
map_dfr(read_excel)
merged_df <- merged_df[ , !(names(merged_df) %in%
c("orgunitlevel1", "orgunitlevel2", "orgunitlevel4", "NMCP IPD Confirmed Malaria Cases", "NMCP IPD Total Malaria Deaths"))]
dates_merged_df <- separate(merged_df, col=periodname, into=c("Month", "Year"), sep=" ")
dates_merged_df <- separate(dates_merged_df, col =orgunitlevel3, into = c("District", "facility"), sep = "-" )Renaming the NMCP OPD Confirmed Malaria Cases to remove spaces
v1_merged_df <- dates_merged_df %>%
rename("Incidence" = "NMCP OPD Confirmed Malaria Cases",
"cluster" = "organisationunitname")getting only specific years from 2018 to 2024
summarized_df <- v1_merged_df %>%
filter(Year >= 2018, Year <= 2024)Importing climate data into work space
lines <- readLines("Windspeed-2010-2025-Monthly.csv")
begin_data <- grep("-END HEADER-", lines)
wind_df <- read_csv("Windspeed-2010-2025-Monthly.csv", skip = begin_data)
temp_df <-read_csv("Temperature-2010-2025-Monthly.csv", skip = begin_data)
rain_df <-read_csv("Rainfall-Monthly_2010_2025.csv", skip = begin_data)
hum_df <-read_csv("RHum-2010-2025-Monthly.csv")Changing all column names to lower case and Filtering period of analysis in climate data from 2018-2024
#writing it the Malaria data as csv and viewing resaving it
write.csv(summarized_df, "malaria.csv", row.names = FALSE)
malaria_df <- read_csv("malaria.csv")
glimpse(malaria_df)Rows: 63,654
Columns: 6
$ District <chr> "Mangochi", "Mangochi", "Mangochi", "Mangochi", "Mangochi", …
$ facility <chr> "DHO", "DHO", "DHO", "DHO", "DHO", "DHO", "DHO", "DHO", "DHO…
$ cluster <chr> "Aa-salam Clinic", "Aa-salam Clinic", "Aa-salam Clinic", "Aa…
$ Month <chr> "February", "March", "April", "May", "June", "July", "August…
$ Year <dbl> 2019, 2019, 2019, 2019, 2019, 2019, 2019, 2019, 2019, 2019, …
$ Incidence <dbl> 451, 794, 308, 276, 291, 170, 133, 155, 128, 59, 62, NA, NA,…
names(malaria_df) <- tolower(names(malaria_df))
names(wind_df) <- tolower(names(wind_df))
names(rain_df) <- tolower(names(rain_df))
names(hum_df) <- tolower(names(hum_df))
names(temp_df) <- tolower(names(temp_df))
str(rain_df$year) num [1:1280] 2010 2010 2010 2010 2010 2010 2010 2010 2010 2010 ...
wind_df <- wind_df %>%
filter(year >= 2018, year <= 2024)
rain_df <- rain_df %>%
filter(year >= 2018, year <= 2024)
hum_df <- hum_df %>%
filter(year >= 2018, year <= 2024)
temp_df <- temp_df %>%
filter(year >= 2018, year <= 2024)Merging auxiliary facilities/clusters to parent facilities
malaria_df <- malaria_df %>%
mutate(district = case_when(
district == "Queen Elizabeth Central Hospital" ~ "Blantyre",
district == "Zomba Mental Hospital" ~ "Zomba",
district == "Zomba Central Hospital" ~ "Zomba",
district == "Nkhata" ~ "Nkhatabay",
district == "Mzuzu Central Hospital" ~ "Mzuzu",
TRUE ~ district #keeping all other names unchanged
))
#names(malaria_df)Putting in a level factor so that months are ordered during plotting
malaria_df$month <- factor(malaria_df$month,
levels = c("January","February","March","April","May","June",
"July","August","September","October","November","December"))#Summary of values in the malaria dataframe
malaria_df %>%
summarise(
min_incidence = min(incidence, na.rm = TRUE),
max_incidence = max(incidence, na.rm = TRUE),
mean_incidence = mean(incidence, na.rm = TRUE),
sd_incidence = sd(incidence, na.rm = TRUE)
)# A tibble: 1 × 4
min_incidence max_incidence mean_incidence sd_incidence
<dbl> <dbl> <dbl> <dbl>
1 0 137136 1249. 4316.
LOADING POPULATION DATA
# Load population rasters
tif_files <- list.files("population/", pattern = "\\.tif$", full.names = TRUE)
pop_rasters <- lapply(tif_files, terra::rast)
# Extract year from filename
years <- gsub("[^0-9]", "", basename(tif_files)) |> as.numeric()
# Load Malawi districts
districts <- terra::vect("mwi_adm_nso_hotosm_20230405_shp/mwi_admbnda_adm2_nso_hotosm_20230405.shp")
# Reproject districts to match raster CRS
districts <- project(districts, pop_rasters[[1]])
# Identify correct district column
names(districts) # LOOK HERE [1] "ADM2_EN" "ADM2_PCODE" "ADM1_EN" "ADM1_PCODE" "ADM0_EN"
[6] "ADM0_PCODE" "date" "validOn" "validTo" "Shape_Leng"
[11] "Shape_Area"
district_col <- "ADM2_EN" # change based on actual name
# Extract population for each raster
district_pop_list <- lapply(seq_along(pop_rasters), function(i) {
r <- pop_rasters[[i]]
yr <- years[i]
vals <- terra::extract(r, districts, fun = sum, na.rm = TRUE)
data.frame(
district = districts[[district_col]],
year = yr,
population = vals[,2]
)
})
district_pop_all <- dplyr::bind_rows(district_pop_list)Creating an incidence per 1000 column
#changing name of population data per district
district_df <- district_pop_all %>%
rename(district = ADM2_EN)
malaria_with_pop <- malaria_df %>%
left_join(district_df, by = c("district", "year"))
# 1. Create incidence per 1,000
malaria_with_pop <- malaria_with_pop %>%
mutate(incidence_per_1k = (incidence / population) * 1000)
#view(malaria_with_pop)Plotting incidence from 2018-2024
#Lineplot showing highest points of incidence
malaria_with_pop$year <- as.numeric(as.character(malaria_with_pop$year))
malaria_max <- malaria_with_pop %>%
group_by(year) %>%
summarise(
max_incidence = max(incidence, na.rm = TRUE),
.groups = "drop"
)
ggplot(malaria_max, aes(x = year, y = max_incidence)) +
geom_line(linewidth = 1.2, color = "darkred") +
geom_point(size = 2, color = "darkred") +
theme_minimal() +
labs(
title = "Malaria Incidence by Year 2018-2024",
x = "Year",
y = "Incidence"
)# -----------------------------
# 1. Filter data for 2018–2024
# -----------------------------
malaria_with_pop <- malaria_with_pop %>%
mutate(year = as.numeric(year)) %>%
filter(year >= 2018 & year <= 2024)
# -----------------------------
# 2. Convert months to numbers and create date
# -----------------------------
malaria_with_pop <- malaria_with_pop %>%
mutate(
month_num = match(tolower(trimws(month)), tolower(month.name)),
date = as.Date(paste(year, month_num, "01", sep = "-"))
)
# -----------------------------
# 3. Compute monthly peak incidence
# -----------------------------
malaria_monthly_max <- malaria_with_pop %>%
group_by(date) %>%
summarise(
max_incidence = max(incidence, na.rm = TRUE),
.groups = "drop"
)
# -----------------------------
# 4. Ensure full monthly timeline (2018–2024)
# -----------------------------
full_dates <- data.frame(
date = seq(as.Date("2018-01-01"), as.Date("2024-12-01"), by = "month")
)
malaria_monthly_max <- full_dates %>%
left_join(malaria_monthly_max, by = "date")
# -----------------------------
# 5. Create labels for months and year
# -----------------------------
malaria_monthly_max <- malaria_monthly_max %>%
mutate(
month_label = format(date, "%b"), # Jan, Feb, ...
year_label = ifelse(format(date, "%m") == "01", format(date, "%Y"), "")
)
# -----------------------------
# 6. Identify positions for vertical grid lines (start of each year)
# -----------------------------
year_start_dates <- malaria_monthly_max$date[format(malaria_monthly_max$date, "%m") == "01"]
# -----------------------------
# 7. Plot
# -----------------------------
ggplot(malaria_monthly_max, aes(x = date, y = max_incidence)) +
geom_line(color = "darkred", linewidth = 1.2, na.rm = TRUE) +
geom_point(color = "darkred", size = 1.8, na.rm = TRUE) +
# vertical lines to mark start of each year
geom_vline(xintercept = as.numeric(year_start_dates), color = "grey50", linetype = "dashed") +
# x-axis formatting
scale_x_date(
breaks = malaria_monthly_max$date,
labels = malaria_monthly_max$month_label,
limits = c(as.Date("2018-01-01"), as.Date("2024-12-01"))
) +
# Add year labels as horizontal text above plot
annotate("text",
x = year_start_dates,
y = max(malaria_monthly_max$max_incidence, na.rm = TRUE) * 1.05,
label = format(year_start_dates, "%Y"),
vjust = 0,
hjust = 0.5,
size = 3.5) +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 60, hjust = 1, size = 5),
panel.grid.minor.x = element_blank()
) +
labs(
title = "Malaria Incidence (2018–2024)",
x = "Month",
y = "Incidence"
) +
expand_limits(y = max(malaria_monthly_max$max_incidence, na.rm = TRUE) * 1.1) # make room for year labels# Ensure month is ordered
malaria_with_pop$month <- factor(malaria_with_pop$month,
levels = c("January","February","March","April","May","June",
"July","August","September","October","November","December"))#checking districts with highest incidence values:
monthly_year_district <- malaria_with_pop %>%
group_by(district, year, month) %>%
summarise(Total_Incidence = sum(incidence_per_1k, na.rm = TRUE), .groups = "drop")
# Heatmap faceted by district
ggplot(monthly_year_district, aes(x = month, y = factor(year), fill = Total_Incidence)) +
geom_tile(color = "white") +
scale_fill_gradient(low = "lightyellow", high = "steelblue") +
facet_wrap(~district, scales = "free_y") +
labs(title = "Monthly Incidence per District per 1,000 (2018–2024)",
x = "Month",
y = "Year",
fill = "Incidence") +
theme_minimal(base_size = 12) +
theme(
axis.text.x = element_text(angle = 45, hjust = 1, size = 5),
axis.text.y = element_text(size = 12),
axis.title = element_text(size = 10, face = "bold"),
legend.title = element_text(size = 10),
legend.text = element_text(size = 12),
strip.text = element_text(face = "bold", size = 8),
plot.title = element_text(face = "bold", size = 10, hjust = 0.5)
)#view(hum_df)
#removing the column annual from the climate datasets
temp_df <- temp_df[ , !(names(temp_df) %in% c("ann"))]
rain_df <- rain_df[ , !(names(rain_df) %in% c("ann"))]
hum_df <- hum_df[ , !(names(hum_df) %in% c("ann"))]
wind_df <- wind_df[ , !(names(wind_df) %in% c("ann"))]
hum_df <- hum_df %>%
rename("parameter" = "parameter-hum")unique(malaria_with_pop$district) [1] "Mangochi" "Lilongwe" "Nkhotakota" "Chikwawa" "Thyolo"
[6] "Kasungu" "Karonga" "Balaka" "Blantyre" "Dedza"
[11] "Chiradzulu" "Ntcheu" "Zomba" "Rumphi" "Mulanje"
[16] "Dowa" "Nkhatabay" "Mzimba" "Salima" "Machinga"
[21] "Chitipa" "Nsanje" "Neno" "Mchinji" "Ntchisi"
[26] "Phalombe" "Likoma" "Mwanza" "Mzuzu"
#converting climate datasets to data.table
setDT(rain_df)
setDT(wind_df)
setDT(temp_df)
setDT(hum_df)
#reshaping to long format using melt
months <- c("jan","feb","mar","apr","may","jun",
"jul","aug","sep","oct","nov","dec")
rain_long <- melt(rain_df,
id.vars = c("year","lat","lon"),
measure.vars = months,
variable.name = "month",
value.name = "rainfall")
hum_long <- melt(hum_df,
id.vars = c("year","lat","lon"),
measure.vars = months,
variable.name = "month",
value.name = "humidity")
temp_long <- melt(temp_df,
id.vars = c("year","lat","lon"),
measure.vars = months,
variable.name = "month",
value.name = "temperature")
wind_long <- melt(wind_df,
id.vars = c("year","lat","lon"),
measure.vars = months,
variable.name = "month",
value.name = "windspeed")#converting month names to number and creating a date column
climate_list <- list(rain_long, hum_long, temp_long, wind_long)
for(i in seq_along(climate_list)){
climate_list[[i]][, month_num := match(tolower(month), tolower(month.abb))]
climate_list[[i]][, date := as.Date(paste(year, month_num, "01", sep = "-"))]
}
# Assign back
rain_long <- climate_list[[1]]
hum_long <- climate_list[[2]]
temp_long <- climate_list[[3]]
wind_long <- climate_list[[4]]
#keeping only necessary columns
rain_long <- rain_long[, .(year, lat, lon, month, date, rainfall)]
hum_long <- hum_long[, .(year, lat, lon, month, date, humidity)]
temp_long <- temp_long[, .(year, lat, lon, month, date, temperature)]
wind_long <- wind_long[, .(year, lat, lon, month, date, windspeed)]
#merging climate datasets
climate_df <- merge(rain_long, hum_long,
by = c("year","lat","lon","month","date"), all = TRUE)
climate_df <- merge(climate_df, temp_long,
by = c("year","lat","lon","month","date"), all = TRUE)
climate_df <- merge(climate_df, wind_long,
by = c("year","lat","lon","month","date"), all = TRUE)
head(climate_df)Key: <year, lat, lon, month, date>
year lat lon month date rainfall humidity temperature windspeed
<num> <num> <num> <fctr> <Date> <num> <num> <num> <num>
1: 2018 -17 33.125 jan 2018-01-01 0.62 58.22 27.42 3.02
2: 2018 -17 33.125 feb 2018-02-01 11.33 83.60 24.88 2.08
3: 2018 -17 33.125 mar 2018-03-01 3.78 78.75 24.98 2.39
4: 2018 -17 33.125 apr 2018-04-01 0.81 70.54 23.97 2.83
5: 2018 -17 33.125 may 2018-05-01 0.24 62.94 23.01 2.64
6: 2018 -17 33.125 jun 2018-06-01 0.01 57.74 20.15 2.77
Plotting climate
# -----------------------------
# 1. Convert to long format
# -----------------------------
climate_long <- climate_df %>%
select(date, rainfall, humidity, temperature, windspeed) %>%
pivot_longer(
cols = c(rainfall, humidity, temperature, windspeed),
names_to = "variable",
values_to = "value"
)
# -----------------------------
# 2. Ensure full monthly timeline
# -----------------------------
full_dates <- expand.grid(
date = seq(as.Date("2018-01-01"), as.Date("2024-12-01"), by = "month"),
variable = unique(climate_long$variable)
)
climate_long <- full_dates %>%
left_join(climate_long, by = c("date", "variable"))
# -----------------------------
# 3. Month and year labels
# -----------------------------
climate_long <- climate_long %>%
mutate(
month_label = format(date, "%b"),
year_start = format(date, "%m") == "01"
)
year_start_dates <- unique(climate_long$date[climate_long$year_start])
# -----------------------------
# 4. Plot: lines + months + years
# -----------------------------
ggplot(climate_long, aes(x = date, y = value, group = variable)) +
# Line plot
geom_line(color = "steelblue", linewidth = 0.9, na.rm = TRUE) +
# Vertical dashed lines at start of each year
geom_vline(
xintercept = as.numeric(year_start_dates),
color = "grey70",
linetype = "dashed",
size = 0.5
) +
# X-axis: monthly ticks
scale_x_date(
date_labels = "%b",
date_breaks = "1 month",
limits = c(as.Date("2018-01-01"), as.Date("2024-12-01"))
) +
# Facet per variable
facet_wrap(~ variable, scales = "free_y", ncol = 1) +
# Year labels above facets
geom_text(
data = data.frame(date = year_start_dates),
aes(
x = date,
y = max(climate_long$value, na.rm = TRUE) * 1.05,
label = format(date, "%Y")
),
inherit.aes = FALSE,
vjust = 0,
hjust = 0.5,
size = 3.5,
color = "grey30"
) +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 60, hjust = 1, size = 6),
panel.grid.minor.x = element_blank(),
strip.text = element_text(face = "bold"),
plot.margin = margin(10, 10, 20, 10)
) +
labs(
title = "Monthly Climatic Trends (2018–2024)",
x = "Month",
y = "Value"
)#Geolocating points in the climate data and linking it to distrits for merging with the malaria data
# Load Malawi districts shapefile
districts <- st_read("mwi_adm_nso_hotosm_20230405_shp/mwi_admbnda_adm2_nso_hotosm_20230405.shp")Reading layer `mwi_admbnda_adm2_nso_hotosm_20230405' from data source
`/Users/m1/Downloads/Research/Malaria-Incidence-Prediction/mwi_adm_nso_hotosm_20230405_shp/mwi_admbnda_adm2_nso_hotosm_20230405.shp'
using driver `ESRI Shapefile'
Simple feature collection with 32 features and 11 fields
Geometry type: MULTIPOLYGON
Dimension: XY
Bounding box: xmin: 32.67164 ymin: -17.12975 xmax: 35.91848 ymax: -9.367346
Geodetic CRS: WGS 84
# Transform to same CRS as your climate points
districts <- st_transform(districts, crs = 4326) # WGS84
# 2. Convert climate dataframe to sf
# Replace 'climate_df_raw' with your actual dataframe
climate_sf <- st_as_sf(climate_df, coords = c("lon", "lat"), crs = 4326)
# 3. Spatial join with districts (buffer + nearest fallback)
# First, buffer districts slightly to catch edge points
malawi_buffered <- st_buffer(districts, dist = 0.01)
# Initial join using buffer
points_with_district <- st_join(climate_sf, malawi_buffered["ADM2_EN"], join = st_within)
# Handle points still NA (nearest district)
na_points <- points_with_district[is.na(points_with_district$ADM2_EN), ]
nearest_index <- st_nearest_feature(na_points, districts)
na_points$ADM2_EN <- districts$ADM2_EN[nearest_index]
points_with_district$ADM2_EN[is.na(points_with_district$ADM2_EN)] <- na_points$ADM2_EN
# 4. Clean dataframe
colnames(points_with_district)[1] "year" "month" "date" "rainfall" "humidity"
[6] "temperature" "windspeed" "ADM2_EN" "geometry"
head(points_with_district)Simple feature collection with 6 features and 8 fields
Geometry type: POINT
Dimension: XY
Bounding box: xmin: 33.125 ymin: -17 xmax: 33.125 ymax: -17
Geodetic CRS: WGS 84
year month date rainfall humidity temperature windspeed ADM2_EN
1 2018 jan 2018-01-01 0.62 58.22 27.42 3.02 Chikwawa
2 2018 feb 2018-02-01 11.33 83.60 24.88 2.08 Chikwawa
3 2018 mar 2018-03-01 3.78 78.75 24.98 2.39 Chikwawa
4 2018 apr 2018-04-01 0.81 70.54 23.97 2.83 Chikwawa
5 2018 may 2018-05-01 0.24 62.94 23.01 2.64 Chikwawa
6 2018 jun 2018-06-01 0.01 57.74 20.15 2.77 Chikwawa
geometry
1 POINT (33.125 -17)
2 POINT (33.125 -17)
3 POINT (33.125 -17)
4 POINT (33.125 -17)
5 POINT (33.125 -17)
6 POINT (33.125 -17)
district_df <- district_pop_all %>%
rename(district = ADM2_EN)
head(district_df) district year population
1 Chitipa 2018 244071.83
2 Karonga 2018 379911.26
3 Nkhatabay 2018 295123.36
4 Rumphi 2018 238295.04
5 Mzimba 2018 976468.82
6 Likoma 2018 14986.53
view(district_df)# 2. Ensure month is ordered
malaria_with_pop$month <- factor(malaria_with_pop$month, levels = month.name)
# 3. Summarize incidence per district per month
monthly_district_norm <- malaria_with_pop %>%
group_by(district, month) %>%
summarise(Incidence_per_1k = sum(incidence_per_1k, na.rm = TRUE),
.groups = "drop")
# 4. Plot heatmap
ggplot(monthly_district_norm, aes(x = month, y = district, fill = Incidence_per_1k)) +
geom_tile(color = "white") +
scale_fill_gradient(low = "lightyellow", high = "steelblue", name = "Incidence\n(per 1,000)") +
labs(title = "Monthly Malaria Incidence per District (per 1,000 people)",
x = "Month",
y = "District") +
theme_minimal(base_size = 12) +
theme(
axis.title.x = element_text(size = 12, face = "bold"),
axis.title.y = element_text(size = 12, face = "bold"),
axis.text.x = element_text(angle = 45, hjust = 1, size = 8),
axis.text.y = element_text(size = 8),
legend.title = element_text(size = 10),
legend.text = element_text(size = 8),
plot.title = element_text(face = "bold", size = 14, hjust = 0.5)
)climate_df <- points_with_district %>%
st_drop_geometry() %>%
select(year, month, date, rainfall, humidity, temperature, windspeed, ADM2_EN) %>%
rename(district = ADM2_EN)
# Check results
head(climate_df) year month date rainfall humidity temperature windspeed district
1 2018 jan 2018-01-01 0.62 58.22 27.42 3.02 Chikwawa
2 2018 feb 2018-02-01 11.33 83.60 24.88 2.08 Chikwawa
3 2018 mar 2018-03-01 3.78 78.75 24.98 2.39 Chikwawa
4 2018 apr 2018-04-01 0.81 70.54 23.97 2.83 Chikwawa
5 2018 may 2018-05-01 0.24 62.94 23.01 2.64 Chikwawa
6 2018 jun 2018-06-01 0.01 57.74 20.15 2.77 Chikwawa
unique(climate_df$district) [1] "Chikwawa" "Nsanje" "Thyolo" "Mulanje"
[5] "Dedza" "Mwanza" "Blantyre" "Zomba"
[9] "Lilongwe" "Ntcheu" "Balaka" "Machinga"
[13] "Mangochi" "Mchinji" "Lilongwe City" "Dowa"
[17] "Salima" "Kasungu" "Nkhotakota" "Mzimba"
[21] "Likoma" "Nkhatabay" "Rumphi" "Chitipa"
[25] "Karonga"
# 5. Plot Malawi map with district labels
districts_sf <- st_as_sf(districts)
# Plot
ggplot() +
geom_sf(data = districts_sf, fill = NA, color = "black") +
geom_sf_text(data = districts_sf, aes(label = ADM2_EN), size = 2, color = "blue") +
theme_minimal() +
labs(title = "Malawi Districts", caption = "Source: Malawi shapefile")##Facets for climate per district
#unique(climate_df$district)
# -----------------------------
# 1. Clean district names
# -----------------------------
climate_df <- climate_df %>%
mutate(
district = case_when(
tolower(district) %in% c("lilongwe city", "lilongwe city council") ~ "Lilongwe",
TRUE ~ district
)
)
# -----------------------------
# 2. Convert to long format
# -----------------------------
climate_long <- climate_df %>%
select(district, date, rainfall, humidity, temperature, windspeed) %>%
pivot_longer(
cols = c(rainfall, humidity, temperature, windspeed),
names_to = "variable",
values_to = "value"
)
# -----------------------------
# 3. Ensure full monthly timeline per district
# -----------------------------
full_dates <- expand.grid(
date = seq(as.Date("2018-01-01"), as.Date("2024-12-01"), by = "month"),
district = unique(climate_long$district)
)
climate_long <- full_dates %>%
left_join(climate_long, by = c("date", "district"))
# -----------------------------
# 4. Month label and year start dates
# -----------------------------
climate_long <- climate_long %>%
mutate(month_label = format(date, "%b"))
year_start_dates <- unique(climate_long$date[format(climate_long$date, "%m") == "01"])
# -----------------------------
# 5. Create output directory
# -----------------------------
base_dir <- "climate_plots_by_district"
dir.create(base_dir, showWarnings = FALSE)
# -----------------------------
# 6. Loop per district
# -----------------------------
for (d in unique(climate_long$district)) {
district_dir <- file.path(base_dir, d)
dir.create(district_dir, showWarnings = FALSE)
district_data <- filter(climate_long, district == d)
p <- ggplot(district_data, aes(x = date, y = value, group = variable)) +
# Smoothed peak trend
geom_smooth(
method = "loess",
span = 0.25,
se = FALSE,
color = "steelblue",
linewidth = 1.1,
na.rm = TRUE
) +
# Vertical dashed lines at the start of each year
geom_vline(
xintercept = as.numeric(year_start_dates),
color = "grey70",
linetype = "dashed",
size = 0.5
) +
# X-axis: monthly breaks, labels as month abbreviations
scale_x_date(
date_labels = "%b",
date_breaks = "1 month",
limits = c(as.Date("2018-01-01"), as.Date("2024-12-01"))
) +
# Facet by variable
facet_wrap(~ variable, scales = "free_y", ncol = 1) +
# Year labels above plot
geom_text(
data = data.frame(date = year_start_dates),
aes(x = date, y = max(district_data$value, na.rm = TRUE) * 1.05,
label = format(date, "%Y")),
inherit.aes = FALSE,
vjust = 0,
hjust = 0.5,
size = 3.5,
color = "grey30"
) +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 60, hjust = 1, size = 7),
axis.text.y = element_text(size = 7),
panel.grid.minor = element_blank(),
strip.text = element_text(face = "bold", size = 12),
panel.spacing = unit(1.6, "lines"),
plot.margin = margin(20, 20, 30, 20)
) +
labs(
title = paste("Climatic Trends in", d, "(2018–2024)"),
x = "Month",
y = "Value"
)
# Save to file
ggsave(
filename = file.path(district_dir, paste0("climate_trends_", d, ".png")),
plot = p,
width = 12,
height = 16,
units = "in",
dpi = 300
)
}#merging climate with malaria data
setDT(malaria_with_pop)
head(malaria_with_pop) district facility cluster month year incidence population
<char> <char> <char> <fctr> <num> <num> <num>
1: Mangochi DHO Aa-salam Clinic February 2019 451 1228651
2: Mangochi DHO Aa-salam Clinic March 2019 794 1228651
3: Mangochi DHO Aa-salam Clinic April 2019 308 1228651
4: Mangochi DHO Aa-salam Clinic May 2019 276 1228651
5: Mangochi DHO Aa-salam Clinic June 2019 291 1228651
6: Mangochi DHO Aa-salam Clinic July 2019 170 1228651
incidence_per_1k month_num date
<num> <int> <Date>
1: 0.3670691 2 2019-02-01
2: 0.6462370 3 2019-03-01
3: 0.2506814 4 2019-04-01
4: 0.2246365 5 2019-05-01
5: 0.2368451 6 2019-06-01
6: 0.1383631 7 2019-07-01
head(malaria_with_pop) district facility cluster month year incidence population
<char> <char> <char> <fctr> <num> <num> <num>
1: Mangochi DHO Aa-salam Clinic February 2019 451 1228651
2: Mangochi DHO Aa-salam Clinic March 2019 794 1228651
3: Mangochi DHO Aa-salam Clinic April 2019 308 1228651
4: Mangochi DHO Aa-salam Clinic May 2019 276 1228651
5: Mangochi DHO Aa-salam Clinic June 2019 291 1228651
6: Mangochi DHO Aa-salam Clinic July 2019 170 1228651
incidence_per_1k month_num date
<num> <int> <Date>
1: 0.3670691 2 2019-02-01
2: 0.6462370 3 2019-03-01
3: 0.2506814 4 2019-04-01
4: 0.2246365 5 2019-05-01
5: 0.2368451 6 2019-06-01
6: 0.1383631 7 2019-07-01
#merging the climate and malaria data
# Make sure the date columns are Date type
malaria_with_pop$date <- as.Date(malaria_with_pop$date)
climate_df$date <- as.Date(climate_df$date)
climate_df <- climate_df %>%
mutate(month = month.name[match(tolower(month), tolower(month.abb))])
malaria_with_pop <- malaria_with_pop %>% mutate(year = as.numeric(as.character(year)))
climate_df <- climate_df %>% mutate(year = as.numeric(as.character(year)))
head(climate_df) year month date rainfall humidity temperature windspeed district
1 2018 January 2018-01-01 0.62 58.22 27.42 3.02 Chikwawa
2 2018 February 2018-02-01 11.33 83.60 24.88 2.08 Chikwawa
3 2018 March 2018-03-01 3.78 78.75 24.98 2.39 Chikwawa
4 2018 April 2018-04-01 0.81 70.54 23.97 2.83 Chikwawa
5 2018 May 2018-05-01 0.24 62.94 23.01 2.64 Chikwawa
6 2018 June 2018-06-01 0.01 57.74 20.15 2.77 Chikwawa
# Join datasets by district and date
merged_df <- malaria_with_pop %>%
left_join(climate_df, by = c("district", "date", "year","month"))
# Checking result of merge
head(merged_df) district facility cluster month year incidence population
<char> <char> <char> <char> <num> <num> <num>
1: Mangochi DHO Aa-salam Clinic February 2019 451 1228651
2: Mangochi DHO Aa-salam Clinic February 2019 451 1228651
3: Mangochi DHO Aa-salam Clinic February 2019 451 1228651
4: Mangochi DHO Aa-salam Clinic February 2019 451 1228651
5: Mangochi DHO Aa-salam Clinic February 2019 451 1228651
6: Mangochi DHO Aa-salam Clinic February 2019 451 1228651
incidence_per_1k month_num date rainfall humidity temperature
<num> <int> <Date> <num> <num> <num>
1: 0.3670691 2 2019-02-01 7.04 82.88 24.38
2: 0.3670691 2 2019-02-01 5.01 86.72 22.87
3: 0.3670691 2 2019-02-01 8.42 76.47 25.42
4: 0.3670691 2 2019-02-01 6.90 87.05 22.56
5: 0.3670691 2 2019-02-01 9.40 79.18 24.44
6: 0.3670691 2 2019-02-01 8.85 87.16 22.31
windspeed
<num>
1: 1.75
2: 1.74
3: 2.67
4: 1.68
5: 2.43
6: 1.59
# Summarize incidence per district per year
yearly_district <- malaria_with_pop %>%
group_by(district, year) %>%
summarise(Total_Incidence = sum(incidence_per_1k, na.rm = TRUE), .groups = "drop")
# Make year a factor for proper ordering
yearly_district$year <- factor(yearly_district$year, levels = 2018:2025)
# Heatmap
ggplot(yearly_district, aes(x = year, y = district, fill = Total_Incidence)) +
geom_tile(color = "white") +
scale_fill_gradient(low = "lightyellow", high = "steelblue") +
labs(title = "Annual Incidence per District per 1,000 (2018–2024)",
x = "Year",
y = "District",
fill = "Incidence") +
theme_minimal(base_size = 12) +
theme(
axis.title.x = element_text(size = 10, face = "bold"),
axis.title.y = element_text(size = 10, face = "bold"),
axis.text.x = element_text(size = 10),
axis.text.y = element_text(size = 8),
legend.title = element_text(size = 10),
legend.text = element_text(size = 8),
plot.title = element_text(face = "bold", size = 10, hjust = 0.5)
)# 1. Convert SpatVector to sf
districts_sf <- st_as_sf(districts)
# 2. Aggregate malaria incidence
malaria_agg <- malaria_with_pop %>%
group_by(district, year) %>%
summarise(total_incidence = sum(incidence_per_1k, na.rm = TRUE)) %>%
ungroup()
# 3. Join with districts
malaria_map_data <- districts_sf %>%
left_join(malaria_agg, by = c("ADM2_EN" = "district")) %>%
filter(!is.na(total_incidence))
# 4. Plot choropleth
ggplot(malaria_map_data) +
geom_sf(aes(fill = total_incidence), color = "black", size = 0.2) +
scale_fill_gradient(low = "#FFF7BC", high = "#D95F0E", name = "Incidence") +
theme_minimal() +
labs(title = "Malaria Incidence per District in Malawi (2018-2024)") +
facet_wrap(~ year) +
theme(
axis.text = element_blank(),
axis.ticks = element_blank(),
panel.grid = element_blank()
)#making correlation for incidence with climate variables
# Define climate variables
climate_vars <- c("rainfall", "humidity", "temperature", "windspeed")
# Function to compute correlation and p-value safely
cor_test_safe <- function(x, y) {
if(sum(!is.na(x) & !is.na(y)) > 1) {
test <- cor.test(x, y, use = "complete.obs")
tibble(correlation = test$estimate, p_value = test$p.value)
} else {
tibble(correlation = NA_real_, p_value = NA_real_)
}
}
# Compute correlation and p-value per district and variable
correlation_per_district <- merged_df %>%
group_by(district) %>%
summarise(
across(
all_of(climate_vars),
~ list(cor_test_safe(incidence, .x)),
.names = "{.col}"
),
.groups = "drop"
) %>%
mutate(across(all_of(climate_vars), ~ map_dfr(.x, ~.x))) %>%
unnest(cols = everything(), names_sep = "_")
# Reshape to long format
cor_long <- correlation_per_district %>%
pivot_longer(
cols = matches(paste0("^(", paste(climate_vars, collapse = "|"), ")_")),
names_to = c("climate_variable", ".value"),
names_sep = "_"
) %>%
filter(!is.na(correlation))
# Function to convert p-values to significance stars
p_to_stars <- function(p) {
case_when(
p < 0.001 ~ "***",
p < 0.01 ~ "**",
p < 0.05 ~ "*",
p < 0.1 ~ ".",
TRUE ~ ""
)
}
colnames(cor_long)[1] "district" "climate_variable" "correlation" "p"
cor_long <- cor_long %>%
mutate(stars = p_to_stars(p))
# Plot heatmap with correlation and stars
ggplot(cor_long, aes(x = climate_variable, y = district, fill = correlation)) +
geom_tile(color = "white") +
geom_text(aes(label = stars), color = "black", size = 5) +
scale_fill_gradient2(low = "blue", mid = "white", high = "red", midpoint = 0, name = "Correlation") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(
title = "Correlation of Malaria Incidence vs Climate Variables per District",
x = "Climate Variable",
y = "District",
subtitle = "Significance indicated by stars: *** p<0.001, ** p<0.01, * p<0.05, . p<0.1"
)#creating month lags for climate variables vs incidence
#create monthly lags 1 - 5
# 1. Define helper functions
# Safe correlation test (returns tibble even if correlation fails)
cor_test_safe <- function(x, y) {
tryCatch({
res <- cor.test(x, y, method = "pearson")
tibble(correlation = unname(res$estimate),
p_value = res$p.value)
}, error = function(e) tibble(correlation = NA, p_value = NA))
}
# Convert p-values to significance stars
p_to_stars <- function(p) {
case_when(
is.na(p) ~ "",
p < 0.001 ~ "***",
p < 0.01 ~ "**",
p < 0.05 ~ "*",
p < 0.1 ~ ".",
TRUE ~ ""
)
}
# 2. Define climate variables
climate_vars <- c("rainfall", "humidity", "temperature", "windspeed")
# 3. Create lagged climate variables (1–5 months)
lagged_df <- merged_df %>%
group_by(district) %>%
arrange(date, .by_group = TRUE) %>%
mutate(across(
all_of(climate_vars),
list(
lag1 = ~lag(.x, 1),
lag2 = ~lag(.x, 2),
lag3 = ~lag(.x, 3),
lag4 = ~lag(.x, 4),
lag5 = ~lag(.x, 5),
lag6 = ~lag(.x, 6)
),
.names = "{.col}_{.fn}"
)) %>%
ungroup()
# 4. Compute correlations for each district and lag
climate_lag_vars <- grep("rainfall|humidity|temperature|windspeed", names(lagged_df), value = TRUE)
correlation_lags <- lagged_df %>%
group_by(district) %>%
summarise(
across(
all_of(climate_lag_vars),
~ list(cor_test_safe(incidence, .x)),
.names = "{.col}"
),
.groups = "drop"
) %>%
mutate(across(all_of(climate_lag_vars), ~ map_dfr(.x, ~.x))) %>%
unnest(cols = everything(), names_sep = "_")
# 5. Reshape to long format for plotting/analysis
cor_long_lags <- correlation_lags %>%
pivot_longer(
cols = matches("_(lag[1-5])_"),
names_to = c("climate_variable", "lag", ".value"),
names_pattern = "(.*)_(lag[1-6])_(.*)"
) %>%
filter(!is.na(correlation)) %>%
mutate(stars = p_to_stars(p_value))
# Add this short line below:
cor_long_lags <- cor_long_lags %>%
mutate(stars = ifelse(stars == "", "none", stars))#correlation heatmaps with district facets
cor_long_lags_heat <- cor_long_lags %>%
mutate(stars = ifelse(stars == "none", "", stars))
ggplot(cor_long_lags_heat,
aes(x = lag, y = climate_variable, fill = correlation)) +
geom_tile(color = "white") +
geom_text(aes(label = stars), color = "black", size = 4, fontface = "bold") +
scale_fill_gradient2(
low = "blue", mid = "white", high = "red",
midpoint = 0,
limits = c(-1, 1),
name = "Correlation"
) +
facet_wrap(~ district, ncol = 4) +
theme_minimal(base_size = 12) +
labs(
title = "Lagged Climate–Malaria Correlation Matrices by District (1–5 Month Lags)",
x = "Lag (months)",
y = "Climate Variable"
) +
theme(
panel.grid = element_blank(),
axis.text.x = element_text(size = 10),
axis.text.y = element_text(size = 10),
strip.text = element_text(face = "bold")
)#plotting lagged values per year
lagged_df <- lagged_df %>%
mutate(year = year(date))
#getting correlatioms per year
correlation_lags_year <- lagged_df %>%
group_by(district, year) %>%
summarise(
across(
all_of(climate_lag_vars),
~ list(cor_test_safe(incidence, .x)),
.names = "{.col}"
),
.groups = "drop"
) %>%
mutate(across(all_of(climate_lag_vars), ~ map_dfr(.x, ~.x))) %>%
unnest(cols = everything(), names_sep = "_")
#reshaping to long format
cor_long_lags_year <- correlation_lags_year %>%
pivot_longer(
cols = matches("_(lag[1-5])_"),
names_to = c("climate_variable", "lag", ".value"),
names_pattern = "(.*)_(lag[1-5])_(.*)"
) %>%
filter(!is.na(correlation)) %>%
mutate(
stars = p_to_stars(p_value),
stars = ifelse(stars == "none", "", stars),
lag = factor(lag, levels = paste0("lag", 1:5))
)#plotting facets per year
ggplot(cor_long_lags_year,
aes(x = lag, y = climate_variable, fill = correlation)) +
geom_tile(color = "white") +
geom_text(aes(label = stars), color = "black", size = 3, fontface = "bold") +
scale_fill_gradient2(
low = "blue", mid = "white", high = "red",
midpoint = 0,
limits = c(-1, 1),
name = "Correlation"
) +
facet_grid(year ~ district) + # rows = year, columns = district
theme_minimal(base_size = 12) +
labs(
title = "Lagged Climate–Malaria Correlation Matrices by District and Year",
x = "Lag (months)",
y = "Climate Variable"
) +
theme(
panel.grid = element_blank(),
axis.text.x = element_text(size = 8),
axis.text.y = element_text(size = 10),
strip.text = element_text(face = "bold")
)# PLOT: Lagged Climate–Malaria Correlations by District and Year
ggplot(cor_long_lags_year,
aes(x = lag, y = climate_variable, fill = correlation)) +
geom_tile(color = "white") +
geom_text(aes(label = stars), color = "black", size = 3, fontface = "bold") +
scale_fill_gradient2(
low = "blue", mid = "white", high = "red",
midpoint = 0,
limits = c(-1, 1),
name = "Correlation"
) +
# ---- FIX SQUASHING ----
facet_wrap(~ year + district, ncol = 6, scales = "free") +
theme_minimal(base_size = 12) +
labs(
title = "Lagged Climate–Malaria Correlation Matrices by District and Year",
x = "Lag (months)",
y = "Climate Variable"
) +
theme(
panel.grid = element_blank(),
axis.text.x = element_text(size = 8),
axis.text.y = element_text(size = 10),
strip.text = element_text(face = "bold", size = 10),
plot.title = element_text(face = "bold", size = 14)
)