I. Data Preparation and Environment Setup

Installation and Loading of Required Packages

if (!require("pacman")) install.packages("pacman")
pacman::p_load(
  raster, terra, dplyr, tidyr, purrr, sf, dismo, sdm, ggplot2, usdm, httr, archive, readr, leaflet, htmltools
)

Reading Species Occurrence Data

species_occurrence <- read_csv(
  "https://raw.githubusercontent.com/quyforest/Pterygoplichthys_pardalis/main/svg_vn.csv",
  show_col_types = FALSE
)

points_vect <- terra::vect(
  species_occurrence,
  geom = c("lon", "lat"),
  crs = 'epsg:4326'
)

Downloading and Extracting Environmental Variables

url <- "https://github.com/quyforest/Pterygoplichthys_pardalis/raw/main/current_vn.rar"
destfile <- "current_vn.rar"
httr::GET(url, httr::write_disk(destfile, overwrite = TRUE))

## Response [https://raw.githubusercontent.com/quyforest/Pterygoplichthys_pardalis/main/current_vn.rar]
##   Date: 2025-11-11 01:40
##   Status: 200
##   Content-Type: application/octet-stream
##   Size: 13.6 MB
## <ON DISK>  C:\Users\Dell 3530\OneDrive\Máy tính\ENMTML-master\ENMTML-master\vignettes\current_vn.rar

archive_files <- archive::archive(destfile)
asc_files <- archive_files$path[grepl("\\.asc$", archive_files$path)]
if (!dir.exists("./asc_files")) dir.create("./asc_files")
archive::archive_extract(destfile, files = asc_files, dir = "./asc_files")
asc_paths <- list.files("./asc_files", pattern = "\\.asc$", full.names = TRUE, recursive = TRUE)

Reading and Processing Environmental Raster Layers

currentEnv <- terra::rast(asc_paths)
names(currentEnv) <- paste0("bio", 1:19)

II. Environmental Data Preprocessing

Eliminating Multicollinearity Using Variance Inflation Factor (VIF)

set.seed(123)
sample_df <- terra::spatSample(currentEnv, size = 3000, na.rm = TRUE, as.df = TRUE)
v <- usdm::vifstep(sample_df, th = 5)
selected_variables <- v@results$Variables
currentEnv_selected <- terra::subset(currentEnv, selected_variables)

III. Building the Species Distribution Model with MaxEnt

Preparing Input Data for Modeling

d <- sdmData(species ~ ., points_vect,
             predictors = currentEnv_selected,
             bg = list(method = "gRandom", n = 500))
d

## class                                 : sdmdata 
## =========================================================== 
## number of species                     :  1 
## species names                         :  species 
## number of features                    :  7 
## feature names                         :  bio2, bio8, bio12, ... 
## type                                  :  Presence-Background 
## has independent test data?             :  FALSE 
## number of records                     :  516 
## has Coordinates?                      :  TRUE

Training the MaxEnt Model with Replications

m <- sdm(species ~ ., d, methods = c("maxent"),
         replication = c("sub", "boot"), test.p = 30, n = 3,
         parallelSetting = list(cores = 10, method = 'parallel'))
m

## class                                 : sdmModels 
## ======================================================== 
## number of species                     :  1 
## number of modelling methods           :  1 
## names of modelling methods            :  maxent 
## replicate.methods (data partitioning) :  subsampling,bootstrap 
## number of replicates (each method)    :  3 
## toral number of replicates per model  :  6 (per species) 
## test percentage (in subsampling)      :  30 
## ------------------------------------------
## model run success percentage (per species)  :
## ------------------------------------------
## method          species          
## ---------------------- 
## maxent     :        100   %
## 
## ###################################################################
## model Mean performance (per species), using test dataset (generated using partitioning):
## -------------------------------------------------------------------------------
## 
##  ## species   :  species 
## =========================
## 
## methods    :     AUC     |     COR     |     TSS     |     Deviance 
## -------------------------------------------------------------------------
## maxent     :     0.92    |     0.38    |     0.77    |     0.44

IV. Prediction, Classification, and Visualization

Predicting Distribution and Exporting the Result Raster

en <- ensemble(m, currentEnv, filename = 'en.img',
                setting = list(method = "weighted",
                               stat = "tss", opt = 2), overwrite = TRUE)
en

## class       : SpatRaster 
## dimensions  : 1779, 878, 1  (nrow, ncol, nlyr)
## resolution  : 0.008333333, 0.008333333  (x, y)
## extent      : 102.1447, 109.4614, 8.566548, 23.39155  (xmin, xmax, ymin, ymax)
## coord. ref. : lon/lat WGS 84 (CRS84) (OGC:CRS84) 
## source      : en.img 
## name        : ensemble_weighted 
## min value   :       0.003084289 
## max value   :       0.999972701

Classifying Suitability Raster into Four Classes

classified_raster <- terra::classify(
  en,
  rcl = matrix(c(-Inf, 0.2, 1,
                 0.2, 0.4, 2,
                 0.4, 0.6, 3,
                 0.6, Inf, 4),
               ncol = 3, byrow = TRUE)
)

Calculating Area and Percentage of Each Suitability Class

cell_area <- terra::cellSize(classified_raster, unit = "km")
area_values <- terra::zonal(cell_area, classified_raster, fun = "sum", na.rm = TRUE)
colnames(area_values) <- c("Value", "Area_km2")

area_df <- data.frame(
  Value = 1:4,
  Class = c("Unsuitable", "Suitable", "Highly suitable", "Extremely suitable")
) %>%
  left_join(area_values, by = "Value") %>%
  mutate(
    Area_km2 = ifelse(is.na(Area_km2), 0, Area_km2)
  )
raster_land_area <- sum(area_df$Area_km2)
area_df <- area_df %>%
  mutate(
    Percentage = Area_km2 / raster_land_area * 100
  )
legend_labels <- paste0(
  area_df$Class, "\n(",
  format(round(area_df$Percentage, 2), nsmall = 1, decimal.mark = ","), "%)"
)
total_suitable_percent <- sum(area_df$Percentage[2:4])

Projecting Raster to WGS84 for Leaflet Visualization

r_leaf <- terra::project(classified_raster, "EPSG:4326")

Interactive Map of Predicted Potential Distribution

pal <- colorFactor(
  palette = c(
    "1" = "#440154",    # Unsuitable
    "2" = "#21918c",    # Suitable
    "3" = "#fde725",    # Highly suitable
    "4" = "#e41a1c"     # Extremely suitable
  ),
  domain = c(1,2,3,4)
)

leaflet() %>%
  addProviderTiles("CartoDB.Positron") %>%
  addRasterImage(
    r_leaf,
    colors = pal,
    opacity = 0.7,
    project = FALSE
  ) %>%
  addLegend(
    pal = pal,
    values = c(1,2,3,4),
    title = "Legend",
    labFormat = function(type, cuts, p) { legend_labels },
    position = "bottomright"
  ) %>%
  addControl(
    html = htmltools::tags$div(
      HTML(
        paste0(
          "<b>Potential distribution zones of Pterygoplichthys pardalis in Vietnam</b><br>",
          "<span style='color:gray;'>Model: MaxEnt</span><br>",
          "The total area of suitable zones (including suitable, highly suitable and extremely suitable) accounts for <b>",
          format(round(total_suitable_percent, 2), nsmall = 1, decimal.mark = ","),
          "%</b> of Vietnam's land area.<br>",
          "<span style='color:red; font-style:italic;'>Note: Paracel and Spratly Islands are under Vietnam's sovereignty.</span>"
        )
      )
    ),
    position = "topright"
  )

V. Model Evaluation

ROC Curve

# Calculate ROC curve data
.calculate_roc_data <- function(observed, predicted) {
  th <- quantile(predicted, probs = seq(0, 1, length.out = 101), na.rm = TRUE)
  roc_data <- purrr::map_dfr(th, function(t) {
    predicted_class <- as.integer(predicted >= t)
    cm <- table(factor(predicted_class, levels = 0:1), 
                factor(observed, levels = 0:1))
    sensitivity <- cm[2,2] / sum(cm[,2])
    specificity <- cm[1,1] / sum(cm[,1])
    data.frame(threshold = t, sensitivity = sensitivity, specificity = specificity)
  })
  # Start at coordinate (0,0)
  roc_data <- dplyr::bind_rows(
    data.frame(threshold = NA, sensitivity = 0, specificity = 1),
    roc_data
  )
  roc_data
}

# Smooth ROC curve
.smooth_roc <- function(roc_data) {
  tryCatch({
    smoothed <- supsmu(1 - roc_data$specificity, roc_data$sensitivity, bass = 0)
    if (smoothed$y[1] != 0) {
      smoothed$x <- c(0, smoothed$x)
      smoothed$y <- c(0, smoothed$y)
    }
    data.frame(fpr = smoothed$x, sensitivity = smoothed$y)
  }, error = function(e) {
    data.frame(fpr = 1 - roc_data$specificity, sensitivity = roc_data$sensitivity)
  })
}

# Main ROC plotting function using ggplot2
ggroc <- function(x, p = NULL, species = NULL, method = NULL, 
                  replication = NULL, run = NULL, wtest = NULL, 
                  smooth = FALSE, legend = TRUE, ...) {

  dot_args <- list(...)
  if (inherits(x, "sdmModels")) {
    mi <- sdm::getModelInfo(x, species = species, method = method, 
                            replication = replication, run = run)
    if (is.null(wtest)) {
      wtest <- names(x@models[[1]][[1]][[1]]@evaluation)
    } else {
      wtest <- match.arg(wtest, c("training", "test.dep", "test.indep"), several.ok = TRUE)
      if (length(wtest) > 2) wtest <- wtest[1:2]
    }
    roc_data <- purrr::map(wtest, function(wt) {
      purrr::map(mi$modelID, function(id) {
        eval_data <- x@models[[mi$species[1]]][[mi$method[1]]][[as.character(id)]]@evaluation[[wt]]
        if (inherits(eval_data, "sdmEvaluate")) {
          .calculate_roc_data(eval_data@observed, eval_data@predicted)
        } else NULL
      }) %>% purrr::compact()
    }) %>% rlang::set_names(wtest)
    auc_values <- purrr::map_dbl(wtest, function(wt) {
      mean(purrr::map_dbl(mi$modelID, function(id) {
        eval_data <- x@models[[mi$species[1]]][[mi$method[1]]][[as.character(id)]]@evaluation[[wt]]
        if (inherits(eval_data, "sdmEvaluate")) {
          eval_data@statistics$AUC
        } else NA
      }), na.rm = TRUE)
    })
    plot_data <- purrr::map2_dfr(roc_data, wtest, function(rd, wt) {
      purrr::map_dfr(rd, function(r) {
        if (smooth) {
          smoothed <- .smooth_roc(r)
          data.frame(
            fpr = smoothed$fpr,
            sensitivity = smoothed$sensitivity,
            test = wt,
            type = "smoothed"
          )
        } else {
          data.frame(
            fpr = 1 - r$specificity,
            sensitivity = r$sensitivity,
            test = wt,
            type = "raw"
          )
        }
      }, .id = "model")
    })

    roc_palette <- c("training" = "#21918c", "test.dep" = "#fde725", "test.indep" = "#e41a1c")
    p <- ggplot(plot_data, aes(x = fpr, y = sensitivity, color = test)) +
      geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "#440154") +
      geom_line(aes(group = interaction(model, test)), alpha = 0.35) +
      stat_summary(aes(group = test), fun = mean, geom = "line", size = 1.7) +
      coord_equal() +
      scale_x_continuous(limits = c(0, 1), expand = c(0, 0)) +
      scale_y_continuous(limits = c(0, 1), expand = c(0, 0)) +
      labs(
        x = "1 - specificity (False Positive Rate)",
        y = "Sensitivity (True Positive Rate)",
        color = "Test Data"
      ) +
      scale_color_manual(values = roc_palette) +
      theme_minimal(base_size = 14) +
      theme(
        legend.position = if (legend) "bottom" else "none",
        panel.grid.minor = element_blank()
      )

    if (legend && !is.null(auc_values)) {
      auc_labels <- paste0(wtest, " (AUC = ", round(auc_values, 3), ")")
      p <- p + scale_color_manual(labels = auc_labels, values = roc_palette)
    }
    if (!is.null(dot_args$main)) {
      p <- p + ggtitle(dot_args$main)
    }
    return(p)

  } else if (is.numeric(x) && is.numeric(p)) {
    roc_data <- .calculate_roc_data(x, p)
    auc_value <- as.numeric(pROC::auc(x, p))
    if (smooth) {
      smoothed <- .smooth_roc(roc_data)
      plot_data <- data.frame(
        fpr = smoothed$fpr,
        sensitivity = smoothed$sensitivity
      )
    } else {
      plot_data <- data.frame(
        fpr = 1 - roc_data$specificity,
        sensitivity = roc_data$sensitivity
      )
    }
    p <- ggplot(plot_data, aes(x = fpr, y = sensitivity)) +
      geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "#440154") +
      geom_line(color = "#21918c", size = 1.3) +
      coord_equal() +
      scale_x_continuous(limits = c(0, 1), expand = c(0, 0)) +
      scale_y_continuous(limits = c(0, 1), expand = c(0, 0)) +
      labs(
        x = "1 - specificity (False Positive Rate)",
        y = "Sensitivity (True Positive Rate)"
      ) +
      theme_minimal(base_size = 14) +
      theme(panel.grid.minor = element_blank())
    if (legend) {
      p <- p + annotate("text", x = 0.7, y = 0.2, 
                        label = paste("AUC =", round(auc_value, 3)), size = 5, color = "#e41a1c")
    }
    if (!is.null(dot_args$main)) {
      p <- p + ggtitle(dot_args$main)
    }
    return(p)
  } else {
    stop("Invalid input. `x` must be sdmModels or a vector of observed values; `p` must be either a predicted vector or model ID.")
  }
}

ggroc(m, smooth = TRUE)  # ROC for all models (training & test)

Importance of Environmental Variables

library(ggplot2)
library(dplyr)
library(tidyr)
var_names <- c("bio2", "bio8", "bio12", "bio14", "bio15", "bio18", "bio19")
cor_values <- c(20.4, 6.3, 0.1, 9.4, 0.6, 76.3, 1.4)
auc_values <- c(13.2, 3.7, 0.1, 5, 1.3, 58.3, 0.3)
vi_data <- data.frame(
  variable = var_names,
  Correlation = cor_values,
  AUC = auc_values
)
vi_long <- vi_data %>%
  pivot_longer(
    cols = c("Correlation", "AUC"),
    names_to = "metric",
    values_to = "importance"
  )

ggplot(vi_long %>% filter(metric == "AUC"),
       aes(x = reorder(variable, importance), y = importance)) +
  geom_bar(stat = "identity", fill = "#21918c", width = 0.7) +
  labs(title = "Environmental Variable Importance (AUC)",
       x = "Variable",
       y = "Contribution (%)") +
  coord_flip() +
  theme_minimal(base_size = 14)

ggplot(vi_long %>% filter(metric == "Correlation"),
       aes(x = reorder(variable, importance), y = importance)) +
  geom_bar(stat = "identity", fill = "#e41a1c", width = 0.7) +
  labs(title = "Environmental Variable Importance (Correlation)",
       x = "Variable",
       y = "Contribution (%)") +
  coord_flip() +
  theme_minimal(base_size = 14)

Response Curves for Key Bioclimatic Variables

rcurve(m, n = c("bio14", "bio15", "bio18"), smooth = TRUE)

## The id argument is not specified; The modelIDs of 6 successfully fitted models are assigned to id...!

rcurve(m, n = c("bio19", "bio2", "bio8"), smooth = TRUE)

## The id argument is not specified; The modelIDs of 6 successfully fitted models are assigned to id...!

References

Phillips, S. J., Anderson, R. P., & Schapire, R. E. (2006). Maximum Entropy Modeling of Species Geographic Distributions. Ecological Modelling, 190(3-4), 231–259. https://www.sciencedirect.com/science/article/pii/S030438000500267X
WorldClim Bioclimatic Rasters: https://www.worldclim.org/data/bioclim.html
R Packages Used: sdm, raster, terra, leaflet, ggplot2, usdm, etc.

Modeling the Potential Distribution of Pterygoplichthys pardalis (Amazon Sailfin Catfish) Using MaxEnt in R

Nguyen Van Quy

2025-11-11

I. Data Preparation and Environment Setup

Installation and Loading of Required Packages

Reading Species Occurrence Data

Downloading and Extracting Environmental Variables

Reading and Processing Environmental Raster Layers

II. Environmental Data Preprocessing

Eliminating Multicollinearity Using Variance Inflation Factor (VIF)

III. Building the Species Distribution Model with MaxEnt

Preparing Input Data for Modeling

Training the MaxEnt Model with Replications

IV. Prediction, Classification, and Visualization

Predicting Distribution and Exporting the Result Raster

Classifying Suitability Raster into Four Classes

Calculating Area and Percentage of Each Suitability Class

Projecting Raster to WGS84 for Leaflet Visualization

Interactive Map of Predicted Potential Distribution

V. Model Evaluation

ROC Curve

Importance of Environmental Variables

Response Curves for Key Bioclimatic Variables

References