This report documents the process of modelling the potential distribution of Culex mosquitoes in Rwanda using Ecological Niche Modelling (ENM) with MaxEnt.
MaxEnt is a machine learning algorithm based on the principle of maximum entropy. It estimates the distribution of a species by relating known occurrence locations to environmental predictors, producing a habitat suitability surface.
The objectives are:
Map the potential distribution of Culex mosquitoes in Rwanda.
Identify the most important environmental predictors.
Evaluate model performance using cross-validation.
Visualize diagnostics and response curves.
In this section, we prepare the datasets required for modelling. We use two categories of input data:
Species occurrence data – georeferenced presence points of Culex.
Environmental predictors – climatic and ecological variables (temperature, precipitation, elevation, NDVI, and land cover).
# Core spatial and modeling libraries
library(terra) # handles SpatRaster / fast raster ops
library(sf) # vector data
library(raster) # Raster* objects needed by dismo::maxent
library(dismo) # MaxEnt interface
library(usdm) # VIF
library(corrplot) # correlation plotting
library(ggplot2)
library(viridis)
library(dplyr)
library(stringr)
The national boundary was used to crop and mask predictor variables.
rwanda <- vect("Data/gadm41_RWA_shp/gadm41_RWA_0.shp") # National boundaries
rwanda1 <- vect("Data/gadm41_RWA_shp/gadm41_RWA_1.shp") # Region/Province boundaries
plot(rwanda1, main="Rwanda Boundary")
We used WorldClim v2.1 bioclimatic variables (~1 km resolution). These capture long-term trends and seasonality of climate.
# Bioclim variables
bioclim_files <- list.files("Data/wc2.1_30s_bio", pattern = "\\.tif$", full.names = TRUE)
bioclim_stack <- rast(bioclim_files) |> crop(rwanda) |> mask(rwanda)
plot(bioclim_stack[[1]], main = names(bioclim_stack)[1])
Additional predictors included elevation, NDVI, and land cover. These were harmonized to the same extent, resolution, and CRS with Bioclimatic variables.
# Load other environmental files
other_files <- list.files("Data/Rasters", pattern="\\.tif$", full.names = TRUE)
# Function to harmonize rasters
harmonize_rasters <- function(files, template, categorical_patterns = c("map", "landcover", "lc")) {
rasters_aligned <- lapply(files, function(f) {
r <- rast(f)
fname <- basename(f)
# Check if the raster is categorical based on filename
is_cat <- any(str_detect(tolower(fname), categorical_patterns))
method <- ifelse(is_cat, "near", "bilinear")
message("Processing: ", fname, " | Method: ", method)
# Reproject if CRS differs
if (!compareCRS(r, template)) {
r <- project(r, template)
}
# Resample to match template
r <- resample(r, template, method = method)
# Crop to template extent
r <- crop(r, template)
return(r)
})
# Combine into a multi-layer SpatRaster
stack <- rast(rasters_aligned)
names(stack) <- tools::file_path_sans_ext(basename(files))
return(stack)
}
# Use the first bioclim layer as template
template <- bioclim_stack[[1]]
# Harmonize
other_stack <- harmonize_rasters(other_files, template)
# Plot to check
plot(other_stack)
We used geo-referenced presence-only occurrence points of Culex quinquefasciatus.
Since absence data were not available, pseudo-absence background points were randomly generated across Rwanda.
Presence Data
# Load cleaned presence data
pres <- read.csv("Data/Presence/Final_Species_data.csv") %>%
mutate(
longitude = as.numeric(gsub("c\\(|\\)", "", geometry)),
latitude = as.numeric(gsub("\\)", "", X))
) %>%
select(Species, longitude, latitude)
pres_sf <- st_as_sf(pres, coords=c("longitude","latitude"), crs=4326)
# Plot presence
plot(rwanda1, main="Culex Presence")
plot(st_geometry(pres_sf), add=TRUE, col="red", pch=16, cex=0.8)
Background Data
# Background points
set.seed(123)
bg_points <- spatSample(rwanda, size=10000, method="random")
bg_sf <- st_as_sf(bg_points)
# Plot presence and background points
plot(rwanda1, main="Presence (red) & Background (blue)")
plot(st_geometry(bg_sf), add=TRUE, col="blue", pch=20, cex=0.4)
plot(st_geometry(pres_sf), add=TRUE, col="red", pch=16, cex=0.8)
Multicollinearity among predictors inflates model variance and reduces interpretability. We addressed this by applying:
all_stack <- c(bioclim_stack, other_stack)
pres_vals <- terra::extract(all_stack, pres_sf)
bg_vals <- terra::extract(all_stack, bg_sf)
sdm_data <- rbind(
cbind(st_coordinates(pres_sf), pres_vals, presence=1),
cbind(st_coordinates(bg_sf), bg_vals, presence=0)
)
predictors <- subset(sdm_data, select=-c(ID, X, Y, presence))
predictors_cont <- subset(predictors, select=-Rwanda_LandCover_ESA_2020)
cor_mat <- cor(predictors_cont, use="pairwise.complete.obs")
corrplot(cor_mat, method="color", tl.cex=0.7)
vif_result <- vifstep(predictors_cont, th=5)
selected_vars <- vif_result@results$Variables
selected_vars
## [1] "wc2.1_30s_bio_15" "wc2.1_30s_bio_18"
## [3] "wc2.1_30s_bio_19" "wc2.1_30s_bio_2"
## [5] "wc2.1_30s_bio_3" "wc2.1_30s_bio_4"
## [7] "wc2.1_30s_bio_6" "Rwanda_NDVI_2023_Sentinel2"
We ran two models:
Cross-validation (5 replicates) – for robust evaluation.
Single model (all data, jackknife enabled) – for diagnostic plots.
predictor_stack <- all_stack[[c(selected_vars, "Rwanda_LandCover_ESA_2020")]]
predictor_stack_raster <- raster::stack(predictor_stack)
pres_coords <- as.matrix(st_coordinates(pres_sf))
bg_coords <- as.matrix(st_coordinates(bg_sf))
# Replicated model
maxent_model_cv <- maxent(
x = predictor_stack_raster,
p = pres_coords,
a = bg_coords,
args = c("replicates=5", "replicatetype=crossvalidate")
)
# Single model with jackknife
maxent_model_single <- maxent(
x = predictor_stack_raster,
p = pres_coords,
a = bg_coords,
args = c("jackknife=true", "responsecurves=true")
)
The predicted suitability was averaged across cross-validation replicates.
suitability_map <- predict(maxent_model_cv, predictor_stack_raster)
## This is MaxEnt version 3.4.3
suitability_mean <- calc(suitability_map, mean)
suit_df <- as.data.frame(suitability_mean, xy=TRUE)
colnames(suit_df) <- c("x","y","suitability")
rwanda_sf <- st_as_sf(rwanda)
ggplot() +
geom_raster(data=suit_df, aes(x=x, y=y, fill=suitability)) +
scale_fill_viridis(name="Suitability", option="C", direction=-1) +
geom_sf(data=rwanda_sf, fill=NA, color="black") +
geom_point(data=as.data.frame(st_coordinates(pres_sf)), aes(x=X, y=Y),
col="red", size=1.5) +
theme_minimal() +
labs(title="Culex Suitability in Rwanda",
subtitle="Predicted using MaxEnt")
We assessed model accuracy using AUC (Area Under the ROC Curve). Values range from 0.5 (random) to 1.0 (perfect discrimination).
auc_rows <- grep("auc", rownames(maxent_model_cv@results), ignore.case=TRUE, value=TRUE)
auc_values <- maxent_model_cv@results[auc_rows, , drop=FALSE]
auc_values
## species_0 species_1 species_2 species_3 species_4
## Training.AUC 0.9105 0.9228 0.9254 0.9136 0.9180
## Test.AUC 0.8977 0.8748 0.8523 0.9243 0.8813
## AUC.Standard.Deviation 0.0265 0.0261 0.0280 0.0225 0.0248
## species (average)
## Training.AUC 0.9181
## Test.AUC 0.8861
## AUC.Standard.Deviation 0.0256
auc_df <- data.frame(
replicate=1:5,
Train=as.numeric(maxent_model_cv@results["Training.AUC",1:5]),
Test=as.numeric(maxent_model_cv@results["Test.AUC",1:5])
)
ggplot(auc_df, aes(x=factor(replicate))) +
geom_col(aes(y=Test, fill="Test"), position="dodge") +
geom_col(aes(y=Train, fill="Train"), position="dodge") +
scale_fill_manual(values=c("Test"="steelblue", "Train"="darkred")) +
theme_minimal() +
labs(title="Training vs Test AUC across Replicates", x="Replicate", y="AUC")
Variable Importance
We used percent contribution and permutation importance to evaluate predictor influence. ## Variable contribution
var_contrib <- maxent_model_cv@results[grep("contribution", rownames(maxent_model_cv@results)), ]
# Save results as a dataframe
var_contrib_df <- data.frame(
Variable = gsub(".contribution", "", rownames(var_contrib)),
Contribution = as.numeric(var_contrib[,1])
)
# Plot
ggplot(var_contrib_df, aes(x = reorder(Variable, Contribution), y = Contribution)) +
geom_col(fill = "steelblue") +
coord_flip() +
labs(title = "Variable Contributions", y = "% Contribution", x = "Variable") +
theme_minimal(base_size = 14) #+
#ggsave("Results/Variable_contribution.png", width=6, height=5)
var_perm <- maxent_model_cv@results[grep("permutation", rownames(maxent_model_cv@results)), ]
# Save results as a dataframe
#perm_import <- results[grep("permutation", rownames(var_perm)), ]
perm_import_df <- data.frame(
Variable = gsub(".permutation.importance", "", rownames(var_perm)),
Importance = as.numeric(var_perm[,1])
)
var_contrib_df <- data.frame(
Variable = gsub(".contribution", "", rownames(var_contrib)),
Contribution = as.numeric(var_contrib[,1])
)
## Plot
ggplot(perm_import_df, aes(x = reorder(Variable, Importance), y = Importance)) +
geom_col(fill = "darkgreen") +
coord_flip() +
labs(title = "Permutation Importance", y = "% Importance", x = "Variable") +
theme_minimal(base_size = 14) #+
#ggsave("Results/Permutation_importance.png", width=6, height=5)
Response curves show how predicted suitability changes with each variable. * Increasing Curve: Suggests a positive relationship, where higher values of the variable are associated with a higher probability of presence. * Decreasing Curve: Indicates a negative relationship, where higher values of the variable lead to a lower probability of presence. * Flat or Saturation Point: Can signify a point where the variable no longer influences the probability of presence, or the species has reached its saturation point.
response(maxent_model_single)
Phillips SJ, Anderson RP, Schapire RE. (2006). Maximum entropy modeling of species geographic distributions. Ecological Modelling, 190(3–4):231–259.
Hijmans RJ, Phillips S, Leathwick J, Elith J. (2017). dismo: Species Distribution Modeling. R package version 1.1-4.
R Core Team. (2023). R: A Language and Environment for Statistical Computing.