Cholera Disease Project - Environmental Raster Preparation

Introduction: Environmental Data Preparation for Cholera Ecological Niche Modeling

Objectives of Data Preparation

Before running the ecological niche model (ENM), we must:
Load and preprocess environmental raster data (NetCDF format).
Compute average climate conditions across multiple time periods.
Remove problematic variables (NA values, zero-inflated layers).
Standardize and scale climate predictors for modeling consistency.
Mask climate data to the study region for spatial relevance.


Workflow Overview

1️⃣ Load Environmental Raster Data

  • Import climate variables stored in NetCDF format for different time periods (2000s & 2010s).

  • Use an automated function to streamline raster loading.

2️⃣ Process and Aggregate Climate Variables

  • Compute the mean environmental conditions across time periods.

  • Create a single environmental dataset representing present-day conditions.

3️⃣ Mask Climate Data to the Marine Region

  • Ensure that environmental predictors align with Cholera’s habitat (marine regions).

  • Exclude land-based environmental layers to maintain model accuracy.

4️⃣ Standardize and Scale Raster Data

  • Remove correlated or problematic variables to avoid model bias.

  • Normalize climate variables to a common scale before analysis.

# Load necessary libraries for spatial and data manipulation
library(terra)      # Handling raster data
library(tidyverse)  # Data manipulation
library(ncdf4)      # NetCDF file handling
library(maps)       # Mapping support

Load Environmental Data

Define a Function to Load Rasters

To streamline the process, we define a function that reads NetCDF files for different variables.

# Function to load raster data
load_raster <- function(var, decade) {
  path <- sprintf("Cholera/Climate/%ss/%s_%s.nc", 
                  decade, 
                  var, 
                  decade)
  rast(path)
}

Load Rasters Efficiently

Instead of manually loading each file, we use a list approach.

# Define environmental variables with full names as comments
variables <- c("so",        # Salinity (-)
               "thetao",    # Ocean Temperature (°C)
               "chl",       # Chlorophyll (mg.m⁻³)
               "ph",        # pH (-)
               "clt",       # Cloud Cover (%)
               "dfe",       # Dissolved Molecular Iron (mmol.m⁻³)
               "kdpar",     # Diffuse Attenuation (m⁻¹)
               "mlotst",    # Mixed Layer Depth (m)
               "no3",       # Nitrate (mmol.m⁻³)
               "o2",        # Dissolved Molecular Oxygen (mmol.m⁻³)
               "par",       # Photosynthetically Available Radiation (E.m⁻².day⁻¹)
               "phyc",      # Primary Productivity; Total Phytoplankton (mmol.m⁻³)
               "po4",       # Phosphate (mmol.m⁻³)
               "si",        # Silicate (mmol.m⁻³)
               "siconc",    # Sea Ice Cover (Fraction)
               "sithick",   # Sea Ice Thickness (m)
               "swd",       # Sea Water Direction (degree)
               "sws",       # Sea Water Speed (m.s⁻¹)
               "tas",       # Air Temperature (°C)
               "terrain")   # Bathymetry (m)

# Define time periods
time_periods <- c("2000s", 
                  "2010s")

# Function to load raster data for each variable and period
load_raster <- function(var, period, stat) {
  path <- sprintf("Cholera/Climate/%s/%s_%s_%s.nc", 
                  period, 
                  var, 
                  stat, 
                  period)
  rast(path)
}

# Load rasters for each variable, period, and statistic
rasters <- list()
for (var in variables) {
  for (period in time_periods) {
    rasters[[paste0(var, "_mean_", period)]] <- load_raster(var, period, "mean")
  }
}

Compute Mean Climate Variables for the Present Period

mean_rasters <- list()
for (var in variables) {
  mean_rasters[[paste0(var, "_present")]] <- mean(
    rasters[[paste0(var, "_mean_2000s")]], rasters[[paste0(var, "_mean_2010s")]]
  )
}

# Combine all present-day climate variables into a single stack
current_clim <- rast(mean_rasters)
names(current_clim) <- paste0(variables)

# Plot current climate layers
plot(current_clim, main = names(current_clim))

# Save climate data
writeCDF(current_clim, "Cholera/Climate/Current/current_clim.nc", split = TRUE, overwrite = TRUE)

Load and Process Study Region

# Load marine region shapefile
m_region <- vect("Cholera/M_region/EEZ_IHO_v3_coastal.shp")

# Dissolve into a single polygon
m_region <- aggregate(m_region, 
                      dissolve = TRUE)

# Save cleaned shapefile
writeVector(m_region, "Cholera/M_region/mask_m/m_coastal.shp", overwrite = TRUE)
## Warning in x@pntr$write(filename, layer, filetype, insert[1], overwrite[1], :
## GDAL Message 6: dataset Cholera/M_region/mask_m/m_coastal.shp does not
## support layer creation option OVERWRITE
# Reload for consistency
m_region <- vect("Cholera/M_region/mask_m/m_coastal.shp")

# Assign coordinate reference system (CRS)
crs(m_region) <- "epsg:4326"
crs(current_clim) <- "epsg:4326"

# Plot marine region
plot(m_region)
map("world", add = TRUE)

Load and Process Cholera Occurrence Data

# Load cholera occurrence dataset
cholera <- read_csv("Cholera/data/Occurrence_original.csv")
## Rows: 1907 Columns: 6
## ── Column specification ──────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): Country, Citation, References
## dbl (3): x, y, Year
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Convert occurrence data to spatial vector
cholera_vec <- vect(cholera, 
                    geom = c("x", "y"), 
                    crs = "+proj=longlat +datum=WGS84 +no_defs")

# Crop cholera points to study region
cholera_vec_crop <- terra::crop(cholera_vec, 
                                m_region)

# Convert cropped vector to dataframe
cholera_dat_crop <- as.data.frame(cholera_vec_crop, 
                                  geom = "XY")

# Save cropped occurrence data
write.csv(cholera_dat_crop, "Cholera/data/Occurrence_crop.csv")

# Plot occurrence data
plot(cholera_vec, col = "green")
plot(cholera_vec_crop, add = TRUE)
map("world", add = TRUE)

Mask Climate Data to Study Region

# Crop and mask climate data to the marine region
clim_present_masked <- terra::crop(current_clim, 
                                   m_region,
                                   mask = TRUE)

# Plot masked climate data
plot(clim_present_masked, main = names(clim_present_masked))

# Save masked climate data
writeCDF(clim_present_masked, "Cholera/Climate/Mask/current_clim_m.nc", split = TRUE, overwrite = TRUE)

Processing future climate projection data under two Shared Socioeconomic Pathways (SSPs) for 2050–2060:

  • SSP119: Low-emission scenario
  • SSP585: High-emission scenario

Define Climate Scenarios and Variables

# Define future climate scenarios
scenarios <- c("ssp119", "ssp585")

# Define the environmental variables to be used
variables <- c("so", 
               "thetao", 
               "chl", 
               "ph", 
               "clt",
               "dfe",
               "mlotst",
               "no3", 
               "o2",
               "phyc", 
               "po4", 
               "si", 
               "siconc",
               "sithick", 
               "swd", 
               "sws", 
               "tas")

Load Future Climate Rasters

# Function to load raster files for a given scenario
load_future_rasters <- function(scenario, variables) {
  raster_list <- list()
  
  for (var in variables) {
    file_path <- sprintf("Cholera/Climate/Future_2050_2060/%s/%s_%s.nc", scenario, var, scenario)
    
    # Check if file exists before loading
    if (file.exists(file_path)) {
      raster_list[[var]] <- rast(file_path)
    } else {
      message(sprintf("Warning: File not found - %s", file_path))
    }
  }
  
  return(raster_list)
}

# Load future climate variables for both SSP scenarios
future_clim_ssp119 <- load_future_rasters("ssp119", variables)
future_clim_ssp585 <- load_future_rasters("ssp585", variables)

Convert Raster Lists to SpatRaster Objects

# Convert lists to SpatRaster objects
future_clim_ssp119_2050s <- rast(future_clim_ssp119)
future_clim_ssp585_2050s <- rast(future_clim_ssp585)

Visualize Future Climate Rasters

# Plot SSP119 future climate raster
plot(future_clim_ssp119_2050s, main = names(future_clim_ssp119_2050s))

# Plot SSP585 future climate raster
plot(future_clim_ssp585_2050s, main = names(future_clim_ssp585_2050s))

Save Processed Future Climate Data

# Save processed future climate rasters to NetCDF format
writeCDF(future_clim_ssp119_2050s,
         "Cholera/Climate/Future_2050_2060/future_clim_ssp119_2050s.nc", 
         split = TRUE,
         overwrite = TRUE)

writeCDF(future_clim_ssp585_2050s,
         "Cholera/Climate/Future_2050_2060/future_clim_ssp585_2050s.nc", 
         split = TRUE,
         overwrite = TRUE)