Climatic based Malaria Incidence prediction in Malawi

Author

Doreen A. Ndovie, Dr Sureni Wickramasooria, Nicholas S. Adam PhD

Published

January 9, 2026

Project Description

Malaria remains one of the most persistent public health challenges in Malawi. Every year, thousands of people, especially children and pregnant women are at risk of this deadly disease. But what if we could predict malaria outbreaks before they happen? That’s exactly what our latest research aims to do, using the power of climate data and predictive modeling.

Data Cleaning & Analysis

Importing Libraries

Loading libraries into our work space which will be used for analysis purposes

library(tidyverse)
library(ggplot2)
library(lubridate) #dealing with dates
library(broom)
library(dplyr)
library(stringr)
library(scales)
library(patchwork)
library(tseries)
library(forecast)
library(kableExtra)
library(readxl)
library(purrr)
library(stringi)
library(imputeTS)
library(data.table)
library(writexl)
library(sf)
library(gstat)
library(corrr)
library(lubridate)
library(car)
library(grid)
library(terra)

Loading in Malaria Data sets and merging the files

files <- list.files(pattern="*.xls")

merged_df <- files %>% 
  map_dfr(read_excel)

merged_df <- merged_df[ , !(names(merged_df) %in% 
                              c("orgunitlevel1", "orgunitlevel2", "orgunitlevel4", "NMCP IPD Confirmed Malaria Cases", "NMCP IPD Total Malaria Deaths"))]
dates_merged_df <- separate(merged_df, col=periodname, into=c("Month", "Year"), sep=" ")
dates_merged_df <- separate(dates_merged_df, col =orgunitlevel3, into = c("District", "facility"), sep = "-" )

Renaming the NMCP OPD Confirmed Malaria Cases to remove spaces

v1_merged_df <- dates_merged_df %>%
  rename("Incidence" = "NMCP OPD Confirmed Malaria Cases", 
         "cluster" = "organisationunitname")

getting only specific years from 2018 to 2024

summarized_df <- v1_merged_df %>%
  filter(Year >= 2018, Year <= 2024)

Importing climate data into work space

lines <- readLines("Windspeed-2010-2025-Monthly.csv")

begin_data <- grep("-END HEADER-", lines)

wind_df <- read_csv("Windspeed-2010-2025-Monthly.csv", skip = begin_data)

temp_df <-read_csv("Temperature-2010-2025-Monthly.csv", skip = begin_data)

rain_df <-read_csv("Rainfall-Monthly_2010_2025.csv", skip = begin_data)

hum_df <-read_csv("RHum-2010-2025-Monthly.csv")

Changing all column names to lower case and Filtering period of analysis in climate data from 2018-2024

#writing it the Malaria data as csv and viewing resaving it
write.csv(summarized_df, "malaria.csv", row.names = FALSE)
malaria_df <- read_csv("malaria.csv")
glimpse(malaria_df)

Rows: 63,654
Columns: 6
$ District  <chr> "Mangochi", "Mangochi", "Mangochi", "Mangochi", "Mangochi", …
$ facility  <chr> "DHO", "DHO", "DHO", "DHO", "DHO", "DHO", "DHO", "DHO", "DHO…
$ cluster   <chr> "Aa-salam Clinic", "Aa-salam Clinic", "Aa-salam Clinic", "Aa…
$ Month     <chr> "February", "March", "April", "May", "June", "July", "August…
$ Year      <dbl> 2019, 2019, 2019, 2019, 2019, 2019, 2019, 2019, 2019, 2019, …
$ Incidence <dbl> 451, 794, 308, 276, 291, 170, 133, 155, 128, 59, 62, NA, NA,…

names(malaria_df) <- tolower(names(malaria_df))
names(wind_df) <- tolower(names(wind_df))
names(rain_df) <- tolower(names(rain_df))
names(hum_df) <- tolower(names(hum_df))
names(temp_df) <- tolower(names(temp_df))

str(rain_df$year)

 num [1:1280] 2010 2010 2010 2010 2010 2010 2010 2010 2010 2010 ...

wind_df <- wind_df %>%
  filter(year >= 2018, year <= 2024)

rain_df <- rain_df %>%
  filter(year >= 2018, year <= 2024)

hum_df <- hum_df %>%
  filter(year >= 2018, year <= 2024)

temp_df <- temp_df %>%
  filter(year >= 2018, year <= 2024)

Merging auxiliary facilities/clusters to parent facilities

malaria_df <- malaria_df %>%
  mutate(district = case_when(
    district == "Queen Elizabeth Central Hospital" ~ "Blantyre",
    district == "Zomba Mental Hospital" ~ "Zomba",
    district == "Zomba Central Hospital" ~ "Zomba",
    district == "Nkhata" ~ "Nkhatabay",
    district == "Mzuzu Central Hospital" ~ "Mzuzu",
    TRUE ~ district #keeping all other names unchanged
  ))

#names(malaria_df)

Putting in a level factor so that months are ordered during plotting

malaria_df$month <- factor(malaria_df$month,
                           levels = c("January","February","March","April","May","June",
                                      "July","August","September","October","November","December"))

#Summary of values in the malaria dataframe
malaria_df %>% 
  summarise(
    min_incidence  = min(incidence, na.rm = TRUE),
    max_incidence  = max(incidence, na.rm = TRUE),
    mean_incidence = mean(incidence, na.rm = TRUE),
    sd_incidence   = sd(incidence, na.rm = TRUE)
  )

# A tibble: 1 × 4
  min_incidence max_incidence mean_incidence sd_incidence
          <dbl>         <dbl>          <dbl>        <dbl>
1             0        137136          1249.        4316.

LOADING POPULATION DATA

# Load population rasters

tif_files <- list.files("population/", pattern = "\\.tif$", full.names = TRUE)
pop_rasters <- lapply(tif_files, terra::rast)

# Extract year from filename
years <- gsub("[^0-9]", "", basename(tif_files)) |> as.numeric()

# Load Malawi districts
districts <- terra::vect("mwi_adm_nso_hotosm_20230405_shp/mwi_admbnda_adm2_nso_hotosm_20230405.shp")

# Reproject districts to match raster CRS
districts <- project(districts, pop_rasters[[1]])

# Identify correct district column
names(districts)   # LOOK HERE

 [1] "ADM2_EN"    "ADM2_PCODE" "ADM1_EN"    "ADM1_PCODE" "ADM0_EN"   
 [6] "ADM0_PCODE" "date"       "validOn"    "validTo"    "Shape_Leng"
[11] "Shape_Area"

district_col <- "ADM2_EN"   # change based on actual name

# Extract population for each raster
district_pop_list <- lapply(seq_along(pop_rasters), function(i) {
  r <- pop_rasters[[i]]
  yr <- years[i]

  vals <- terra::extract(r, districts, fun = sum, na.rm = TRUE)

  data.frame(
    district = districts[[district_col]],
    year = yr,
    population = vals[,2]
  )
})

district_pop_all <- dplyr::bind_rows(district_pop_list)

Creating an incidence per 1000 column

#changing name of population data per district
district_df <- district_pop_all %>% 
  rename(district = ADM2_EN)


malaria_with_pop <- malaria_df %>%
  left_join(district_df, by = c("district", "year"))

# 1. Create incidence per 1,000
malaria_with_pop <- malaria_with_pop %>%
  mutate(incidence_per_1k = (incidence / population) * 1000)

#view(malaria_with_pop)

Plotting incidence from 2018-2024

#Lineplot showing highest points of incidence


malaria_with_pop$year <- as.numeric(as.character(malaria_with_pop$year))
malaria_max <- malaria_with_pop %>%
  group_by(year) %>%
  summarise(
    max_incidence = max(incidence, na.rm = TRUE),
    .groups = "drop"
  )
ggplot(malaria_max, aes(x = year, y = max_incidence)) +
  geom_line(linewidth = 1.2, color = "darkred") +
  geom_point(size = 2, color = "darkred") +
  theme_minimal() +
  labs(
    title = "Malaria Incidence by Year 2018-2024",
    x = "Year",
    y = "Incidence"
  )

# -----------------------------
# 1. Filter data for 2018–2024
# -----------------------------
malaria_with_pop <- malaria_with_pop %>%
  mutate(year = as.numeric(year)) %>%
  filter(year >= 2018 & year <= 2024)

# -----------------------------
# 2. Convert months to numbers and create date
# -----------------------------
malaria_with_pop <- malaria_with_pop %>%
  mutate(
    month_num = match(tolower(trimws(month)), tolower(month.name)),
    date = as.Date(paste(year, month_num, "01", sep = "-"))
  )

# -----------------------------
# 3. Compute monthly peak incidence
# -----------------------------
malaria_monthly_max <- malaria_with_pop %>%
  group_by(date) %>%
  summarise(
    max_incidence = max(incidence, na.rm = TRUE),
    .groups = "drop"
  )

# -----------------------------
# 4. Ensure full monthly timeline (2018–2024)
# -----------------------------
full_dates <- data.frame(
  date = seq(as.Date("2018-01-01"), as.Date("2024-12-01"), by = "month")
)

malaria_monthly_max <- full_dates %>%
  left_join(malaria_monthly_max, by = "date")

# -----------------------------
# 5. Create labels for months and year
# -----------------------------
malaria_monthly_max <- malaria_monthly_max %>%
  mutate(
    month_label = format(date, "%b"),               # Jan, Feb, ...
    year_label = ifelse(format(date, "%m") == "01", format(date, "%Y"), "")
  )

# -----------------------------
# 6. Identify positions for vertical grid lines (start of each year)
# -----------------------------
year_start_dates <- malaria_monthly_max$date[format(malaria_monthly_max$date, "%m") == "01"]

# -----------------------------
# 7. Plot
# -----------------------------
ggplot(malaria_monthly_max, aes(x = date, y = max_incidence)) +
  geom_line(color = "darkred", linewidth = 1.2, na.rm = TRUE) +
  geom_point(color = "darkred", size = 1.8, na.rm = TRUE) +
  # vertical lines to mark start of each year
  geom_vline(xintercept = as.numeric(year_start_dates), color = "grey50", linetype = "dashed") +
  # x-axis formatting
  scale_x_date(
    breaks = malaria_monthly_max$date,
    labels = malaria_monthly_max$month_label,
    limits = c(as.Date("2018-01-01"), as.Date("2024-12-01"))
  ) +
  # Add year labels as horizontal text above plot
  annotate("text",
           x = year_start_dates,
           y = max(malaria_monthly_max$max_incidence, na.rm = TRUE) * 1.05,
           label = format(year_start_dates, "%Y"),
           vjust = 0,
           hjust = 0.5,
           size = 3.5) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 60, hjust = 1, size = 5),
    panel.grid.minor.x = element_blank()
  ) +
  labs(
    title = "Malaria Incidence (2018–2024)",
    x = "Month",
    y = "Incidence"
  ) +
  expand_limits(y = max(malaria_monthly_max$max_incidence, na.rm = TRUE) * 1.1)  # make room for year labels

# Ensure month is ordered
malaria_with_pop$month <- factor(malaria_with_pop$month,
                            levels = c("January","February","March","April","May","June",
                                       "July","August","September","October","November","December"))

#checking districts with highest incidence values:
monthly_year_district <- malaria_with_pop %>%
  group_by(district, year, month) %>%
  summarise(Total_Incidence = sum(incidence_per_1k, na.rm = TRUE), .groups = "drop")

# Heatmap faceted by district
ggplot(monthly_year_district, aes(x = month, y = factor(year), fill = Total_Incidence)) +
  geom_tile(color = "white") +
  scale_fill_gradient(low = "lightyellow", high = "steelblue") +
  facet_wrap(~district, scales = "free_y") +
  labs(title = "Monthly Incidence per District per 1,000 (2018–2024)",
       x = "Month",
       y = "Year",
       fill = "Incidence") +
  theme_minimal(base_size = 12) +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = 5),
    axis.text.y = element_text(size = 12),
    axis.title = element_text(size = 10, face = "bold"),
    legend.title = element_text(size = 10),
    legend.text = element_text(size = 12),
    strip.text = element_text(face = "bold", size = 8),
    plot.title = element_text(face = "bold", size = 10, hjust = 0.5)
  )

#view(hum_df)

#removing the column annual from the climate datasets
temp_df <- temp_df[ , !(names(temp_df) %in% c("ann"))] 
rain_df <- rain_df[ , !(names(rain_df) %in% c("ann"))] 
hum_df <- hum_df[ , !(names(hum_df) %in% c("ann"))]
wind_df <- wind_df[ , !(names(wind_df) %in% c("ann"))]
hum_df <- hum_df %>% 
  rename("parameter" = "parameter-hum")

unique(malaria_with_pop$district)

 [1] "Mangochi"   "Lilongwe"   "Nkhotakota" "Chikwawa"   "Thyolo"    
 [6] "Kasungu"    "Karonga"    "Balaka"     "Blantyre"   "Dedza"     
[11] "Chiradzulu" "Ntcheu"     "Zomba"      "Rumphi"     "Mulanje"   
[16] "Dowa"       "Nkhatabay"  "Mzimba"     "Salima"     "Machinga"  
[21] "Chitipa"    "Nsanje"     "Neno"       "Mchinji"    "Ntchisi"   
[26] "Phalombe"   "Likoma"     "Mwanza"     "Mzuzu"

#converting climate datasets to data.table
setDT(rain_df)
setDT(wind_df)
setDT(temp_df)
setDT(hum_df)

#reshaping to long format using melt
months <- c("jan","feb","mar","apr","may","jun",
            "jul","aug","sep","oct","nov","dec")


rain_long <- melt(rain_df,
                  id.vars = c("year","lat","lon"),
                  measure.vars = months,
                  variable.name = "month",
                  value.name = "rainfall")

hum_long <- melt(hum_df,
                 id.vars = c("year","lat","lon"),
                 measure.vars = months,
                 variable.name = "month",
                 value.name = "humidity")

temp_long <- melt(temp_df,
                  id.vars = c("year","lat","lon"),
                  measure.vars = months,
                  variable.name = "month",
                  value.name = "temperature")

wind_long <- melt(wind_df,
                  id.vars = c("year","lat","lon"),
                  measure.vars = months,
                  variable.name = "month",
                  value.name = "windspeed")

#converting month names to number and creating a date column
climate_list <- list(rain_long, hum_long, temp_long, wind_long)

for(i in seq_along(climate_list)){
  climate_list[[i]][, month_num := match(tolower(month), tolower(month.abb))]
  climate_list[[i]][, date := as.Date(paste(year, month_num, "01", sep = "-"))]
}

# Assign back
rain_long <- climate_list[[1]]
hum_long  <- climate_list[[2]]
temp_long <- climate_list[[3]]
wind_long <- climate_list[[4]]

#keeping only necessary columns
rain_long <- rain_long[,  .(year, lat, lon, month, date, rainfall)]
hum_long  <- hum_long[,   .(year, lat, lon, month, date, humidity)]
temp_long <- temp_long[,  .(year, lat, lon, month, date, temperature)]
wind_long <- wind_long[,  .(year, lat, lon, month, date, windspeed)]

#merging climate datasets
climate_df <- merge(rain_long, hum_long,
                    by = c("year","lat","lon","month","date"), all = TRUE)
climate_df <- merge(climate_df, temp_long,
                    by = c("year","lat","lon","month","date"), all = TRUE)
climate_df <- merge(climate_df, wind_long,
                    by = c("year","lat","lon","month","date"), all = TRUE)
head(climate_df)

Key: <year, lat, lon, month, date>
    year   lat    lon  month       date rainfall humidity temperature windspeed
   <num> <num>  <num> <fctr>     <Date>    <num>    <num>       <num>     <num>
1:  2018   -17 33.125    jan 2018-01-01     0.62    58.22       27.42      3.02
2:  2018   -17 33.125    feb 2018-02-01    11.33    83.60       24.88      2.08
3:  2018   -17 33.125    mar 2018-03-01     3.78    78.75       24.98      2.39
4:  2018   -17 33.125    apr 2018-04-01     0.81    70.54       23.97      2.83
5:  2018   -17 33.125    may 2018-05-01     0.24    62.94       23.01      2.64
6:  2018   -17 33.125    jun 2018-06-01     0.01    57.74       20.15      2.77

Plotting climate

# -----------------------------
# 1. Convert to long format
# -----------------------------
climate_long <- climate_df %>%
  select(date, rainfall, humidity, temperature, windspeed) %>%
  pivot_longer(
    cols = c(rainfall, humidity, temperature, windspeed),
    names_to = "variable",
    values_to = "value"
  )

# -----------------------------
# 2. Ensure full monthly timeline
# -----------------------------
full_dates <- expand.grid(
  date = seq(as.Date("2018-01-01"), as.Date("2024-12-01"), by = "month"),
  variable = unique(climate_long$variable)
)

climate_long <- full_dates %>%
  left_join(climate_long, by = c("date", "variable"))

# -----------------------------
# 3. Month and year labels
# -----------------------------
climate_long <- climate_long %>%
  mutate(
    month_label = format(date, "%b"),
    year_start = format(date, "%m") == "01"
  )

year_start_dates <- unique(climate_long$date[climate_long$year_start])

# -----------------------------
# 4. Plot: lines + months + years
# -----------------------------
ggplot(climate_long, aes(x = date, y = value, group = variable)) +

  # Line plot
  geom_line(color = "steelblue", linewidth = 0.9, na.rm = TRUE) +

  # Vertical dashed lines at start of each year
  geom_vline(
    xintercept = as.numeric(year_start_dates),
    color = "grey70",
    linetype = "dashed",
    size = 0.5
  ) +

  # X-axis: monthly ticks
  scale_x_date(
    date_labels = "%b",
    date_breaks = "1 month",
    limits = c(as.Date("2018-01-01"), as.Date("2024-12-01"))
  ) +

  # Facet per variable
  facet_wrap(~ variable, scales = "free_y", ncol = 1) +

  # Year labels above facets
  geom_text(
    data = data.frame(date = year_start_dates),
    aes(
      x = date,
      y = max(climate_long$value, na.rm = TRUE) * 1.05,
      label = format(date, "%Y")
    ),
    inherit.aes = FALSE,
    vjust = 0,
    hjust = 0.5,
    size = 3.5,
    color = "grey30"
  ) +

  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 60, hjust = 1, size = 6),
    panel.grid.minor.x = element_blank(),
    strip.text = element_text(face = "bold"),
    plot.margin = margin(10, 10, 20, 10)
  ) +

  labs(
    title = "Monthly Climatic Trends (2018–2024)",
    x = "Month",
    y = "Value"
  )

#Geolocating points in the climate data and linking it to distrits for merging with the malaria data

# Load Malawi districts shapefile

districts <- st_read("mwi_adm_nso_hotosm_20230405_shp/mwi_admbnda_adm2_nso_hotosm_20230405.shp")

Reading layer `mwi_admbnda_adm2_nso_hotosm_20230405' from data source 
  `/Users/m1/Downloads/Research/Malaria-Incidence-Prediction/mwi_adm_nso_hotosm_20230405_shp/mwi_admbnda_adm2_nso_hotosm_20230405.shp' 
  using driver `ESRI Shapefile'
Simple feature collection with 32 features and 11 fields
Geometry type: MULTIPOLYGON
Dimension:     XY
Bounding box:  xmin: 32.67164 ymin: -17.12975 xmax: 35.91848 ymax: -9.367346
Geodetic CRS:  WGS 84

# Transform to same CRS as your climate points
districts <- st_transform(districts, crs = 4326)  # WGS84


# 2. Convert climate dataframe to sf

# Replace 'climate_df_raw' with your actual dataframe
climate_sf <- st_as_sf(climate_df, coords = c("lon", "lat"), crs = 4326)


# 3. Spatial join with districts (buffer + nearest fallback)

# First, buffer districts slightly to catch edge points
malawi_buffered <- st_buffer(districts, dist = 0.01)

# Initial join using buffer
points_with_district <- st_join(climate_sf, malawi_buffered["ADM2_EN"], join = st_within)

# Handle points still NA (nearest district)
na_points <- points_with_district[is.na(points_with_district$ADM2_EN), ]
nearest_index <- st_nearest_feature(na_points, districts)
na_points$ADM2_EN <- districts$ADM2_EN[nearest_index]
points_with_district$ADM2_EN[is.na(points_with_district$ADM2_EN)] <- na_points$ADM2_EN


# 4. Clean dataframe
colnames(points_with_district)

[1] "year"        "month"       "date"        "rainfall"    "humidity"   
[6] "temperature" "windspeed"   "ADM2_EN"     "geometry"

head(points_with_district)

Simple feature collection with 6 features and 8 fields
Geometry type: POINT
Dimension:     XY
Bounding box:  xmin: 33.125 ymin: -17 xmax: 33.125 ymax: -17
Geodetic CRS:  WGS 84
  year month       date rainfall humidity temperature windspeed  ADM2_EN
1 2018   jan 2018-01-01     0.62    58.22       27.42      3.02 Chikwawa
2 2018   feb 2018-02-01    11.33    83.60       24.88      2.08 Chikwawa
3 2018   mar 2018-03-01     3.78    78.75       24.98      2.39 Chikwawa
4 2018   apr 2018-04-01     0.81    70.54       23.97      2.83 Chikwawa
5 2018   may 2018-05-01     0.24    62.94       23.01      2.64 Chikwawa
6 2018   jun 2018-06-01     0.01    57.74       20.15      2.77 Chikwawa
            geometry
1 POINT (33.125 -17)
2 POINT (33.125 -17)
3 POINT (33.125 -17)
4 POINT (33.125 -17)
5 POINT (33.125 -17)
6 POINT (33.125 -17)

district_df <- district_pop_all %>% 
  rename(district = ADM2_EN)

head(district_df)

   district year population
1   Chitipa 2018  244071.83
2   Karonga 2018  379911.26
3 Nkhatabay 2018  295123.36
4    Rumphi 2018  238295.04
5    Mzimba 2018  976468.82
6    Likoma 2018   14986.53

view(district_df)

# 2. Ensure month is ordered
malaria_with_pop$month <- factor(malaria_with_pop$month, levels = month.name)

# 3. Summarize incidence per district per month
monthly_district_norm <- malaria_with_pop %>%
  group_by(district, month) %>%
  summarise(Incidence_per_1k = sum(incidence_per_1k, na.rm = TRUE),
            .groups = "drop")

# 4. Plot heatmap
ggplot(monthly_district_norm, aes(x = month, y = district, fill = Incidence_per_1k)) +
  geom_tile(color = "white") +
  scale_fill_gradient(low = "lightyellow", high = "steelblue", name = "Incidence\n(per 1,000)") +
  labs(title = "Monthly Malaria Incidence per District (per 1,000 people)",
       x = "Month",
       y = "District") +
  theme_minimal(base_size = 12) +
  theme(
    axis.title.x = element_text(size = 12, face = "bold"),
    axis.title.y = element_text(size = 12, face = "bold"),
    axis.text.x = element_text(angle = 45, hjust = 1, size = 8),
    axis.text.y = element_text(size = 8),
    legend.title = element_text(size = 10),
    legend.text = element_text(size = 8),
    plot.title = element_text(face = "bold", size = 14, hjust = 0.5)
  )

climate_df <- points_with_district %>%
  st_drop_geometry() %>%
  select(year, month, date, rainfall, humidity, temperature, windspeed, ADM2_EN) %>%
  rename(district = ADM2_EN)

# Check results
head(climate_df)

  year month       date rainfall humidity temperature windspeed district
1 2018   jan 2018-01-01     0.62    58.22       27.42      3.02 Chikwawa
2 2018   feb 2018-02-01    11.33    83.60       24.88      2.08 Chikwawa
3 2018   mar 2018-03-01     3.78    78.75       24.98      2.39 Chikwawa
4 2018   apr 2018-04-01     0.81    70.54       23.97      2.83 Chikwawa
5 2018   may 2018-05-01     0.24    62.94       23.01      2.64 Chikwawa
6 2018   jun 2018-06-01     0.01    57.74       20.15      2.77 Chikwawa

unique(climate_df$district)

 [1] "Chikwawa"      "Nsanje"        "Thyolo"        "Mulanje"      
 [5] "Dedza"         "Mwanza"        "Blantyre"      "Zomba"        
 [9] "Lilongwe"      "Ntcheu"        "Balaka"        "Machinga"     
[13] "Mangochi"      "Mchinji"       "Lilongwe City" "Dowa"         
[17] "Salima"        "Kasungu"       "Nkhotakota"    "Mzimba"       
[21] "Likoma"        "Nkhatabay"     "Rumphi"        "Chitipa"      
[25] "Karonga"

# 5. Plot Malawi map with district labels
districts_sf <- st_as_sf(districts)

# Plot
ggplot() +
  geom_sf(data = districts_sf, fill = NA, color = "black") +
  geom_sf_text(data = districts_sf, aes(label = ADM2_EN), size = 2, color = "blue") +
  theme_minimal() +
  labs(title = "Malawi Districts", caption = "Source: Malawi shapefile")

##Facets for climate per district

#unique(climate_df$district)

# -----------------------------
# 1. Clean district names
# -----------------------------
climate_df <- climate_df %>%
  mutate(
    district = case_when(
      tolower(district) %in% c("lilongwe city", "lilongwe city council") ~ "Lilongwe",
      TRUE ~ district
    )
  )

# -----------------------------
# 2. Convert to long format
# -----------------------------
climate_long <- climate_df %>%
  select(district, date, rainfall, humidity, temperature, windspeed) %>%
  pivot_longer(
    cols = c(rainfall, humidity, temperature, windspeed),
    names_to = "variable",
    values_to = "value"
  )

# -----------------------------
# 3. Ensure full monthly timeline per district
# -----------------------------
full_dates <- expand.grid(
  date = seq(as.Date("2018-01-01"), as.Date("2024-12-01"), by = "month"),
  district = unique(climate_long$district)
)

climate_long <- full_dates %>%
  left_join(climate_long, by = c("date", "district"))

# -----------------------------
# 4. Month label and year start dates
# -----------------------------
climate_long <- climate_long %>%
  mutate(month_label = format(date, "%b"))

year_start_dates <- unique(climate_long$date[format(climate_long$date, "%m") == "01"])

# -----------------------------
# 5. Create output directory
# -----------------------------
base_dir <- "climate_plots_by_district"
dir.create(base_dir, showWarnings = FALSE)

# -----------------------------
# 6. Loop per district
# -----------------------------
for (d in unique(climate_long$district)) {

  district_dir <- file.path(base_dir, d)
  dir.create(district_dir, showWarnings = FALSE)

  district_data <- filter(climate_long, district == d)

  p <- ggplot(district_data, aes(x = date, y = value, group = variable)) +

    # Smoothed peak trend
    geom_smooth(
      method = "loess",
      span = 0.25,
      se = FALSE,
      color = "steelblue",
      linewidth = 1.1,
      na.rm = TRUE
    ) +

    # Vertical dashed lines at the start of each year
    geom_vline(
      xintercept = as.numeric(year_start_dates),
      color = "grey70",
      linetype = "dashed",
      size = 0.5
    ) +

    # X-axis: monthly breaks, labels as month abbreviations
    scale_x_date(
      date_labels = "%b",
      date_breaks = "1 month",
      limits = c(as.Date("2018-01-01"), as.Date("2024-12-01"))
    ) +

    # Facet by variable
    facet_wrap(~ variable, scales = "free_y", ncol = 1) +

    # Year labels above plot
    geom_text(
      data = data.frame(date = year_start_dates),
      aes(x = date, y = max(district_data$value, na.rm = TRUE) * 1.05,
          label = format(date, "%Y")),
      inherit.aes = FALSE,
      vjust = 0,
      hjust = 0.5,
      size = 3.5,
      color = "grey30"
    ) +

    theme_minimal() +
    theme(
      axis.text.x = element_text(angle = 60, hjust = 1, size = 7),
      axis.text.y = element_text(size = 7),
      panel.grid.minor = element_blank(),
      strip.text = element_text(face = "bold", size = 12),
      panel.spacing = unit(1.6, "lines"),
      plot.margin = margin(20, 20, 30, 20)
    ) +

    labs(
      title = paste("Climatic Trends in", d, "(2018–2024)"),
      x = "Month",
      y = "Value"
    )

  # Save to file
  ggsave(
    filename = file.path(district_dir, paste0("climate_trends_", d, ".png")),
    plot = p,
    width = 12,
    height = 16,
    units = "in",
    dpi = 300
  )
}

#merging climate with malaria data
setDT(malaria_with_pop)
head(malaria_with_pop)

   district facility         cluster    month  year incidence population
     <char>   <char>          <char>   <fctr> <num>     <num>      <num>
1: Mangochi      DHO Aa-salam Clinic February  2019       451    1228651
2: Mangochi      DHO Aa-salam Clinic    March  2019       794    1228651
3: Mangochi      DHO Aa-salam Clinic    April  2019       308    1228651
4: Mangochi      DHO Aa-salam Clinic      May  2019       276    1228651
5: Mangochi      DHO Aa-salam Clinic     June  2019       291    1228651
6: Mangochi      DHO Aa-salam Clinic     July  2019       170    1228651
   incidence_per_1k month_num       date
              <num>     <int>     <Date>
1:        0.3670691         2 2019-02-01
2:        0.6462370         3 2019-03-01
3:        0.2506814         4 2019-04-01
4:        0.2246365         5 2019-05-01
5:        0.2368451         6 2019-06-01
6:        0.1383631         7 2019-07-01

head(malaria_with_pop)

   district facility         cluster    month  year incidence population
     <char>   <char>          <char>   <fctr> <num>     <num>      <num>
1: Mangochi      DHO Aa-salam Clinic February  2019       451    1228651
2: Mangochi      DHO Aa-salam Clinic    March  2019       794    1228651
3: Mangochi      DHO Aa-salam Clinic    April  2019       308    1228651
4: Mangochi      DHO Aa-salam Clinic      May  2019       276    1228651
5: Mangochi      DHO Aa-salam Clinic     June  2019       291    1228651
6: Mangochi      DHO Aa-salam Clinic     July  2019       170    1228651
   incidence_per_1k month_num       date
              <num>     <int>     <Date>
1:        0.3670691         2 2019-02-01
2:        0.6462370         3 2019-03-01
3:        0.2506814         4 2019-04-01
4:        0.2246365         5 2019-05-01
5:        0.2368451         6 2019-06-01
6:        0.1383631         7 2019-07-01

#merging the climate and malaria data
# Make sure the date columns are Date type
malaria_with_pop$date <- as.Date(malaria_with_pop$date)
climate_df$date <- as.Date(climate_df$date)
climate_df <- climate_df %>%
  mutate(month = month.name[match(tolower(month), tolower(month.abb))])


malaria_with_pop <- malaria_with_pop %>% mutate(year = as.numeric(as.character(year)))
climate_df <- climate_df %>% mutate(year = as.numeric(as.character(year)))

head(climate_df)

  year    month       date rainfall humidity temperature windspeed district
1 2018  January 2018-01-01     0.62    58.22       27.42      3.02 Chikwawa
2 2018 February 2018-02-01    11.33    83.60       24.88      2.08 Chikwawa
3 2018    March 2018-03-01     3.78    78.75       24.98      2.39 Chikwawa
4 2018    April 2018-04-01     0.81    70.54       23.97      2.83 Chikwawa
5 2018      May 2018-05-01     0.24    62.94       23.01      2.64 Chikwawa
6 2018     June 2018-06-01     0.01    57.74       20.15      2.77 Chikwawa

# Join datasets by district and date
merged_df <- malaria_with_pop %>%
  left_join(climate_df, by = c("district", "date", "year","month"))


# Checking result of merge
head(merged_df)

   district facility         cluster    month  year incidence population
     <char>   <char>          <char>   <char> <num>     <num>      <num>
1: Mangochi      DHO Aa-salam Clinic February  2019       451    1228651
2: Mangochi      DHO Aa-salam Clinic February  2019       451    1228651
3: Mangochi      DHO Aa-salam Clinic February  2019       451    1228651
4: Mangochi      DHO Aa-salam Clinic February  2019       451    1228651
5: Mangochi      DHO Aa-salam Clinic February  2019       451    1228651
6: Mangochi      DHO Aa-salam Clinic February  2019       451    1228651
   incidence_per_1k month_num       date rainfall humidity temperature
              <num>     <int>     <Date>    <num>    <num>       <num>
1:        0.3670691         2 2019-02-01     7.04    82.88       24.38
2:        0.3670691         2 2019-02-01     5.01    86.72       22.87
3:        0.3670691         2 2019-02-01     8.42    76.47       25.42
4:        0.3670691         2 2019-02-01     6.90    87.05       22.56
5:        0.3670691         2 2019-02-01     9.40    79.18       24.44
6:        0.3670691         2 2019-02-01     8.85    87.16       22.31
   windspeed
       <num>
1:      1.75
2:      1.74
3:      2.67
4:      1.68
5:      2.43
6:      1.59

# Summarize incidence per district per year
yearly_district <- malaria_with_pop %>%
  group_by(district, year) %>%
  summarise(Total_Incidence = sum(incidence_per_1k, na.rm = TRUE), .groups = "drop")

# Make year a factor for proper ordering
yearly_district$year <- factor(yearly_district$year, levels = 2018:2025)

# Heatmap
ggplot(yearly_district, aes(x = year, y = district, fill = Total_Incidence)) +
  geom_tile(color = "white") +
  scale_fill_gradient(low = "lightyellow", high = "steelblue") +
  labs(title = "Annual Incidence per District per 1,000 (2018–2024)",
       x = "Year",
       y = "District",
       fill = "Incidence") +
  theme_minimal(base_size = 12) +
  theme(
    axis.title.x = element_text(size = 10, face = "bold"),
    axis.title.y = element_text(size = 10, face = "bold"),
    axis.text.x = element_text(size = 10),
    axis.text.y = element_text(size = 8),
    legend.title = element_text(size = 10),
    legend.text = element_text(size = 8),
    plot.title = element_text(face = "bold", size = 10, hjust = 0.5)
  )

# 1. Convert SpatVector to sf
districts_sf <- st_as_sf(districts)

# 2. Aggregate malaria incidence
malaria_agg <- malaria_with_pop %>%
  group_by(district, year) %>%
  summarise(total_incidence = sum(incidence_per_1k, na.rm = TRUE)) %>%
  ungroup()

# 3. Join with districts
malaria_map_data <- districts_sf %>%
  left_join(malaria_agg, by = c("ADM2_EN" = "district")) %>%
  filter(!is.na(total_incidence))

# 4. Plot choropleth
ggplot(malaria_map_data) +
  geom_sf(aes(fill = total_incidence), color = "black", size = 0.2) +
  scale_fill_gradient(low = "#FFF7BC", high = "#D95F0E", name = "Incidence") +
  theme_minimal() +
  labs(title = "Malaria Incidence per District in Malawi (2018-2024)") +
  facet_wrap(~ year) +
  theme(
    axis.text = element_blank(),
    axis.ticks = element_blank(),
    panel.grid = element_blank()
  )

#making correlation for incidence with climate variables
# Define climate variables
climate_vars <- c("rainfall", "humidity", "temperature", "windspeed")

# Function to compute correlation and p-value safely
cor_test_safe <- function(x, y) {
  if(sum(!is.na(x) & !is.na(y)) > 1) {
    test <- cor.test(x, y, use = "complete.obs")
    tibble(correlation = test$estimate, p_value = test$p.value)
  } else {
    tibble(correlation = NA_real_, p_value = NA_real_)
  }
}

# Compute correlation and p-value per district and variable
correlation_per_district <- merged_df %>%
  group_by(district) %>%
  summarise(
    across(
      all_of(climate_vars),
      ~ list(cor_test_safe(incidence, .x)), 
      .names = "{.col}"
    ),
    .groups = "drop"
  ) %>%
  mutate(across(all_of(climate_vars), ~ map_dfr(.x, ~.x))) %>%
  unnest(cols = everything(), names_sep = "_")


# Reshape to long format
cor_long <- correlation_per_district %>%
  pivot_longer(
    cols = matches(paste0("^(", paste(climate_vars, collapse = "|"), ")_")),
    names_to = c("climate_variable", ".value"),
    names_sep = "_"
  ) %>%
  filter(!is.na(correlation))


# Function to convert p-values to significance stars
p_to_stars <- function(p) {
  case_when(
    p < 0.001 ~ "***",
    p < 0.01  ~ "**",
    p < 0.05  ~ "*",
    p < 0.1   ~ ".",
    TRUE      ~ ""
  )
}

colnames(cor_long)

[1] "district"         "climate_variable" "correlation"      "p"

cor_long <- cor_long %>%
  mutate(stars = p_to_stars(p))

# Plot heatmap with correlation and stars
ggplot(cor_long, aes(x = climate_variable, y = district, fill = correlation)) +
  geom_tile(color = "white") +
  geom_text(aes(label = stars), color = "black", size = 5) +
  scale_fill_gradient2(low = "blue", mid = "white", high = "red", midpoint = 0, name = "Correlation") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "Correlation of Malaria Incidence vs Climate Variables per District",
    x = "Climate Variable",
    y = "District",
    subtitle = "Significance indicated by stars: *** p<0.001, ** p<0.01, * p<0.05, . p<0.1"
  )

#creating month lags for climate variables vs incidence
#create monthly lags 1 - 5

# 1. Define helper functions

# Safe correlation test (returns tibble even if correlation fails)
cor_test_safe <- function(x, y) {
  tryCatch({
    res <- cor.test(x, y, method = "pearson")
    tibble(correlation = unname(res$estimate),
           p_value = res$p.value)
  }, error = function(e) tibble(correlation = NA, p_value = NA))
}

# Convert p-values to significance stars
p_to_stars <- function(p) {
  case_when(
    is.na(p) ~ "",
    p < 0.001 ~ "***",
    p < 0.01  ~ "**",
    p < 0.05  ~ "*",
    p < 0.1   ~ ".",
    TRUE ~ ""
  )
}


# 2. Define climate variables

climate_vars <- c("rainfall", "humidity", "temperature", "windspeed")


# 3. Create lagged climate variables (1–5 months)

lagged_df <- merged_df %>%
  group_by(district) %>%
  arrange(date, .by_group = TRUE) %>%
  mutate(across(
    all_of(climate_vars),
    list(
      lag1 = ~lag(.x, 1),
      lag2 = ~lag(.x, 2),
      lag3 = ~lag(.x, 3),
      lag4 = ~lag(.x, 4),
      lag5 = ~lag(.x, 5),
      lag6 = ~lag(.x, 6)
    ),
    .names = "{.col}_{.fn}"
  )) %>%
  ungroup()


# 4. Compute correlations for each district and lag

climate_lag_vars <- grep("rainfall|humidity|temperature|windspeed", names(lagged_df), value = TRUE)

correlation_lags <- lagged_df %>%
  group_by(district) %>%
  summarise(
    across(
      all_of(climate_lag_vars),
      ~ list(cor_test_safe(incidence, .x)), 
      .names = "{.col}"
    ),
    .groups = "drop"
  ) %>%
  mutate(across(all_of(climate_lag_vars), ~ map_dfr(.x, ~.x))) %>%
  unnest(cols = everything(), names_sep = "_")


# 5. Reshape to long format for plotting/analysis

cor_long_lags <- correlation_lags %>%
  pivot_longer(
    cols = matches("_(lag[1-5])_"),
    names_to = c("climate_variable", "lag", ".value"),
    names_pattern = "(.*)_(lag[1-6])_(.*)"
  ) %>%
  filter(!is.na(correlation)) %>%
  mutate(stars = p_to_stars(p_value))

# Add this short line below:
cor_long_lags <- cor_long_lags %>%
  mutate(stars = ifelse(stars == "", "none", stars))

#correlation heatmaps with district facets
cor_long_lags_heat <- cor_long_lags %>%
  mutate(stars = ifelse(stars == "none", "", stars))

ggplot(cor_long_lags_heat,
       aes(x = lag, y = climate_variable, fill = correlation)) +
  geom_tile(color = "white") +
  geom_text(aes(label = stars), color = "black", size = 4, fontface = "bold") +
  scale_fill_gradient2(
    low = "blue", mid = "white", high = "red",
    midpoint = 0,
    limits = c(-1, 1),
    name = "Correlation"
  ) +
  facet_wrap(~ district, ncol = 4) +
  theme_minimal(base_size = 12) +
  labs(
    title = "Lagged Climate–Malaria Correlation Matrices by District (1–5 Month Lags)",
    x = "Lag (months)",
    y = "Climate Variable"
  ) +
  theme(
    panel.grid = element_blank(),
    axis.text.x = element_text(size = 10),
    axis.text.y = element_text(size = 10),
    strip.text = element_text(face = "bold")
  )

#plotting lagged values per year
lagged_df <- lagged_df %>%
  mutate(year = year(date))

#getting correlatioms per year
correlation_lags_year <- lagged_df %>%
  group_by(district, year) %>%
  summarise(
    across(
      all_of(climate_lag_vars),
      ~ list(cor_test_safe(incidence, .x)),
      .names = "{.col}"
    ),
    .groups = "drop"
  ) %>%
  mutate(across(all_of(climate_lag_vars), ~ map_dfr(.x, ~.x))) %>%
  unnest(cols = everything(), names_sep = "_")

#reshaping to long format
cor_long_lags_year <- correlation_lags_year %>%
  pivot_longer(
    cols = matches("_(lag[1-5])_"),
    names_to = c("climate_variable", "lag", ".value"),
    names_pattern = "(.*)_(lag[1-5])_(.*)"
  ) %>%
  filter(!is.na(correlation)) %>%
  mutate(
    stars = p_to_stars(p_value),
    stars = ifelse(stars == "none", "", stars),
    lag = factor(lag, levels = paste0("lag", 1:5))
  )

#plotting facets per year
ggplot(cor_long_lags_year,
       aes(x = lag, y = climate_variable, fill = correlation)) +
  geom_tile(color = "white") +
  geom_text(aes(label = stars), color = "black", size = 3, fontface = "bold") +
  scale_fill_gradient2(
    low = "blue", mid = "white", high = "red",
    midpoint = 0,
    limits = c(-1, 1),
    name = "Correlation"
  ) +
  facet_grid(year ~ district) +  # rows = year, columns = district
  theme_minimal(base_size = 12) +
  labs(
    title = "Lagged Climate–Malaria Correlation Matrices by District and Year",
    x = "Lag (months)",
    y = "Climate Variable"
  ) +
  theme(
    panel.grid = element_blank(),
    axis.text.x = element_text(size = 8),
    axis.text.y = element_text(size = 10),
    strip.text = element_text(face = "bold")
  )

# PLOT: Lagged Climate–Malaria Correlations by District and Year
ggplot(cor_long_lags_year,
            aes(x = lag, y = climate_variable, fill = correlation)) +
  geom_tile(color = "white") +
  geom_text(aes(label = stars), color = "black", size = 3, fontface = "bold") +
  scale_fill_gradient2(
    low = "blue", mid = "white", high = "red",
    midpoint = 0,
    limits = c(-1, 1),
    name = "Correlation"
  ) +
  
  # ---- FIX SQUASHING ----
  facet_wrap(~ year + district, ncol = 6, scales = "free") +
  
  theme_minimal(base_size = 12) +
  labs(
    title = "Lagged Climate–Malaria Correlation Matrices by District and Year",
    x = "Lag (months)",
    y = "Climate Variable"
  ) +
  theme(
    panel.grid = element_blank(),
    axis.text.x = element_text(size = 8),
    axis.text.y = element_text(size = 10),
    strip.text = element_text(face = "bold", size = 10),
    plot.title = element_text(face = "bold", size = 14)
  )