I. Data Preparation and Environment Setup

Installation and Loading of Required Packages

if (!require("pacman")) install.packages("pacman")
pacman::p_load(
  raster, terra, dplyr, tidyr, purrr, sf, dismo, sdm, ggplot2, usdm, httr, archive, readr, leaflet, htmltools
)

Reading Species Occurrence Data

species_occurrence <- read_csv(
  "https://raw.githubusercontent.com/quyforest/Pterygoplichthys_pardalis/main/svg_vn.csv",
  show_col_types = FALSE
)

points_vect <- terra::vect(
  species_occurrence,
  geom = c("lon", "lat"),
  crs = 'epsg:4326'
)

Downloading and Extracting Environmental Variables

url <- "https://github.com/quyforest/Pterygoplichthys_pardalis/raw/main/current_vn.rar"
destfile <- "current_vn.rar"
httr::GET(url, httr::write_disk(destfile, overwrite = TRUE))
## Response [https://raw.githubusercontent.com/quyforest/Pterygoplichthys_pardalis/main/current_vn.rar]
##   Date: 2025-11-11 01:40
##   Status: 200
##   Content-Type: application/octet-stream
##   Size: 13.6 MB
## <ON DISK>  C:\Users\Dell 3530\OneDrive\Máy tính\ENMTML-master\ENMTML-master\vignettes\current_vn.rar
archive_files <- archive::archive(destfile)
asc_files <- archive_files$path[grepl("\\.asc$", archive_files$path)]
if (!dir.exists("./asc_files")) dir.create("./asc_files")
archive::archive_extract(destfile, files = asc_files, dir = "./asc_files")
asc_paths <- list.files("./asc_files", pattern = "\\.asc$", full.names = TRUE, recursive = TRUE)

Reading and Processing Environmental Raster Layers

currentEnv <- terra::rast(asc_paths)
names(currentEnv) <- paste0("bio", 1:19)

II. Environmental Data Preprocessing

Eliminating Multicollinearity Using Variance Inflation Factor (VIF)

set.seed(123)
sample_df <- terra::spatSample(currentEnv, size = 3000, na.rm = TRUE, as.df = TRUE)
v <- usdm::vifstep(sample_df, th = 5)
selected_variables <- v@results$Variables
currentEnv_selected <- terra::subset(currentEnv, selected_variables)

III. Building the Species Distribution Model with MaxEnt

Preparing Input Data for Modeling

d <- sdmData(species ~ ., points_vect,
             predictors = currentEnv_selected,
             bg = list(method = "gRandom", n = 500))
d
## class                                 : sdmdata 
## =========================================================== 
## number of species                     :  1 
## species names                         :  species 
## number of features                    :  7 
## feature names                         :  bio2, bio8, bio12, ... 
## type                                  :  Presence-Background 
## has independent test data?             :  FALSE 
## number of records                     :  516 
## has Coordinates?                      :  TRUE

Training the MaxEnt Model with Replications

m <- sdm(species ~ ., d, methods = c("maxent"),
         replication = c("sub", "boot"), test.p = 30, n = 3,
         parallelSetting = list(cores = 10, method = 'parallel'))
m
## class                                 : sdmModels 
## ======================================================== 
## number of species                     :  1 
## number of modelling methods           :  1 
## names of modelling methods            :  maxent 
## replicate.methods (data partitioning) :  subsampling,bootstrap 
## number of replicates (each method)    :  3 
## toral number of replicates per model  :  6 (per species) 
## test percentage (in subsampling)      :  30 
## ------------------------------------------
## model run success percentage (per species)  :
## ------------------------------------------
## method          species          
## ---------------------- 
## maxent     :        100   %
## 
## ###################################################################
## model Mean performance (per species), using test dataset (generated using partitioning):
## -------------------------------------------------------------------------------
## 
##  ## species   :  species 
## =========================
## 
## methods    :     AUC     |     COR     |     TSS     |     Deviance 
## -------------------------------------------------------------------------
## maxent     :     0.92    |     0.38    |     0.77    |     0.44

IV. Prediction, Classification, and Visualization

Predicting Distribution and Exporting the Result Raster

en <- ensemble(m, currentEnv, filename = 'en.img',
                setting = list(method = "weighted",
                               stat = "tss", opt = 2), overwrite = TRUE)
en
## class       : SpatRaster 
## dimensions  : 1779, 878, 1  (nrow, ncol, nlyr)
## resolution  : 0.008333333, 0.008333333  (x, y)
## extent      : 102.1447, 109.4614, 8.566548, 23.39155  (xmin, xmax, ymin, ymax)
## coord. ref. : lon/lat WGS 84 (CRS84) (OGC:CRS84) 
## source      : en.img 
## name        : ensemble_weighted 
## min value   :       0.003084289 
## max value   :       0.999972701

Classifying Suitability Raster into Four Classes

classified_raster <- terra::classify(
  en,
  rcl = matrix(c(-Inf, 0.2, 1,
                 0.2, 0.4, 2,
                 0.4, 0.6, 3,
                 0.6, Inf, 4),
               ncol = 3, byrow = TRUE)
)

Calculating Area and Percentage of Each Suitability Class

cell_area <- terra::cellSize(classified_raster, unit = "km")
area_values <- terra::zonal(cell_area, classified_raster, fun = "sum", na.rm = TRUE)
colnames(area_values) <- c("Value", "Area_km2")

area_df <- data.frame(
  Value = 1:4,
  Class = c("Unsuitable", "Suitable", "Highly suitable", "Extremely suitable")
) %>%
  left_join(area_values, by = "Value") %>%
  mutate(
    Area_km2 = ifelse(is.na(Area_km2), 0, Area_km2)
  )
raster_land_area <- sum(area_df$Area_km2)
area_df <- area_df %>%
  mutate(
    Percentage = Area_km2 / raster_land_area * 100
  )
legend_labels <- paste0(
  area_df$Class, "\n(",
  format(round(area_df$Percentage, 2), nsmall = 1, decimal.mark = ","), "%)"
)
total_suitable_percent <- sum(area_df$Percentage[2:4])

Projecting Raster to WGS84 for Leaflet Visualization

r_leaf <- terra::project(classified_raster, "EPSG:4326")

Interactive Map of Predicted Potential Distribution

pal <- colorFactor(
  palette = c(
    "1" = "#440154",    # Unsuitable
    "2" = "#21918c",    # Suitable
    "3" = "#fde725",    # Highly suitable
    "4" = "#e41a1c"     # Extremely suitable
  ),
  domain = c(1,2,3,4)
)

leaflet() %>%
  addProviderTiles("CartoDB.Positron") %>%
  addRasterImage(
    r_leaf,
    colors = pal,
    opacity = 0.7,
    project = FALSE
  ) %>%
  addLegend(
    pal = pal,
    values = c(1,2,3,4),
    title = "Legend",
    labFormat = function(type, cuts, p) { legend_labels },
    position = "bottomright"
  ) %>%
  addControl(
    html = htmltools::tags$div(
      HTML(
        paste0(
          "<b>Potential distribution zones of Pterygoplichthys pardalis in Vietnam</b><br>",
          "<span style='color:gray;'>Model: MaxEnt</span><br>",
          "The total area of suitable zones (including suitable, highly suitable and extremely suitable) accounts for <b>",
          format(round(total_suitable_percent, 2), nsmall = 1, decimal.mark = ","),
          "%</b> of Vietnam's land area.<br>",
          "<span style='color:red; font-style:italic;'>Note: Paracel and Spratly Islands are under Vietnam's sovereignty.</span>"
        )
      )
    ),
    position = "topright"
  )

V. Model Evaluation

ROC Curve

# Calculate ROC curve data
.calculate_roc_data <- function(observed, predicted) {
  th <- quantile(predicted, probs = seq(0, 1, length.out = 101), na.rm = TRUE)
  roc_data <- purrr::map_dfr(th, function(t) {
    predicted_class <- as.integer(predicted >= t)
    cm <- table(factor(predicted_class, levels = 0:1), 
                factor(observed, levels = 0:1))
    sensitivity <- cm[2,2] / sum(cm[,2])
    specificity <- cm[1,1] / sum(cm[,1])
    data.frame(threshold = t, sensitivity = sensitivity, specificity = specificity)
  })
  # Start at coordinate (0,0)
  roc_data <- dplyr::bind_rows(
    data.frame(threshold = NA, sensitivity = 0, specificity = 1),
    roc_data
  )
  roc_data
}

# Smooth ROC curve
.smooth_roc <- function(roc_data) {
  tryCatch({
    smoothed <- supsmu(1 - roc_data$specificity, roc_data$sensitivity, bass = 0)
    if (smoothed$y[1] != 0) {
      smoothed$x <- c(0, smoothed$x)
      smoothed$y <- c(0, smoothed$y)
    }
    data.frame(fpr = smoothed$x, sensitivity = smoothed$y)
  }, error = function(e) {
    data.frame(fpr = 1 - roc_data$specificity, sensitivity = roc_data$sensitivity)
  })
}

# Main ROC plotting function using ggplot2
ggroc <- function(x, p = NULL, species = NULL, method = NULL, 
                  replication = NULL, run = NULL, wtest = NULL, 
                  smooth = FALSE, legend = TRUE, ...) {

  dot_args <- list(...)
  if (inherits(x, "sdmModels")) {
    mi <- sdm::getModelInfo(x, species = species, method = method, 
                            replication = replication, run = run)
    if (is.null(wtest)) {
      wtest <- names(x@models[[1]][[1]][[1]]@evaluation)
    } else {
      wtest <- match.arg(wtest, c("training", "test.dep", "test.indep"), several.ok = TRUE)
      if (length(wtest) > 2) wtest <- wtest[1:2]
    }
    roc_data <- purrr::map(wtest, function(wt) {
      purrr::map(mi$modelID, function(id) {
        eval_data <- x@models[[mi$species[1]]][[mi$method[1]]][[as.character(id)]]@evaluation[[wt]]
        if (inherits(eval_data, "sdmEvaluate")) {
          .calculate_roc_data(eval_data@observed, eval_data@predicted)
        } else NULL
      }) %>% purrr::compact()
    }) %>% rlang::set_names(wtest)
    auc_values <- purrr::map_dbl(wtest, function(wt) {
      mean(purrr::map_dbl(mi$modelID, function(id) {
        eval_data <- x@models[[mi$species[1]]][[mi$method[1]]][[as.character(id)]]@evaluation[[wt]]
        if (inherits(eval_data, "sdmEvaluate")) {
          eval_data@statistics$AUC
        } else NA
      }), na.rm = TRUE)
    })
    plot_data <- purrr::map2_dfr(roc_data, wtest, function(rd, wt) {
      purrr::map_dfr(rd, function(r) {
        if (smooth) {
          smoothed <- .smooth_roc(r)
          data.frame(
            fpr = smoothed$fpr,
            sensitivity = smoothed$sensitivity,
            test = wt,
            type = "smoothed"
          )
        } else {
          data.frame(
            fpr = 1 - r$specificity,
            sensitivity = r$sensitivity,
            test = wt,
            type = "raw"
          )
        }
      }, .id = "model")
    })

    roc_palette <- c("training" = "#21918c", "test.dep" = "#fde725", "test.indep" = "#e41a1c")
    p <- ggplot(plot_data, aes(x = fpr, y = sensitivity, color = test)) +
      geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "#440154") +
      geom_line(aes(group = interaction(model, test)), alpha = 0.35) +
      stat_summary(aes(group = test), fun = mean, geom = "line", size = 1.7) +
      coord_equal() +
      scale_x_continuous(limits = c(0, 1), expand = c(0, 0)) +
      scale_y_continuous(limits = c(0, 1), expand = c(0, 0)) +
      labs(
        x = "1 - specificity (False Positive Rate)",
        y = "Sensitivity (True Positive Rate)",
        color = "Test Data"
      ) +
      scale_color_manual(values = roc_palette) +
      theme_minimal(base_size = 14) +
      theme(
        legend.position = if (legend) "bottom" else "none",
        panel.grid.minor = element_blank()
      )

    if (legend && !is.null(auc_values)) {
      auc_labels <- paste0(wtest, " (AUC = ", round(auc_values, 3), ")")
      p <- p + scale_color_manual(labels = auc_labels, values = roc_palette)
    }
    if (!is.null(dot_args$main)) {
      p <- p + ggtitle(dot_args$main)
    }
    return(p)

  } else if (is.numeric(x) && is.numeric(p)) {
    roc_data <- .calculate_roc_data(x, p)
    auc_value <- as.numeric(pROC::auc(x, p))
    if (smooth) {
      smoothed <- .smooth_roc(roc_data)
      plot_data <- data.frame(
        fpr = smoothed$fpr,
        sensitivity = smoothed$sensitivity
      )
    } else {
      plot_data <- data.frame(
        fpr = 1 - roc_data$specificity,
        sensitivity = roc_data$sensitivity
      )
    }
    p <- ggplot(plot_data, aes(x = fpr, y = sensitivity)) +
      geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "#440154") +
      geom_line(color = "#21918c", size = 1.3) +
      coord_equal() +
      scale_x_continuous(limits = c(0, 1), expand = c(0, 0)) +
      scale_y_continuous(limits = c(0, 1), expand = c(0, 0)) +
      labs(
        x = "1 - specificity (False Positive Rate)",
        y = "Sensitivity (True Positive Rate)"
      ) +
      theme_minimal(base_size = 14) +
      theme(panel.grid.minor = element_blank())
    if (legend) {
      p <- p + annotate("text", x = 0.7, y = 0.2, 
                        label = paste("AUC =", round(auc_value, 3)), size = 5, color = "#e41a1c")
    }
    if (!is.null(dot_args$main)) {
      p <- p + ggtitle(dot_args$main)
    }
    return(p)
  } else {
    stop("Invalid input. `x` must be sdmModels or a vector of observed values; `p` must be either a predicted vector or model ID.")
  }
}

ggroc(m, smooth = TRUE)  # ROC for all models (training & test)


Importance of Environmental Variables

library(ggplot2)
library(dplyr)
library(tidyr)
var_names <- c("bio2", "bio8", "bio12", "bio14", "bio15", "bio18", "bio19")
cor_values <- c(20.4, 6.3, 0.1, 9.4, 0.6, 76.3, 1.4)
auc_values <- c(13.2, 3.7, 0.1, 5, 1.3, 58.3, 0.3)
vi_data <- data.frame(
  variable = var_names,
  Correlation = cor_values,
  AUC = auc_values
)
vi_long <- vi_data %>%
  pivot_longer(
    cols = c("Correlation", "AUC"),
    names_to = "metric",
    values_to = "importance"
  )

ggplot(vi_long %>% filter(metric == "AUC"),
       aes(x = reorder(variable, importance), y = importance)) +
  geom_bar(stat = "identity", fill = "#21918c", width = 0.7) +
  labs(title = "Environmental Variable Importance (AUC)",
       x = "Variable",
       y = "Contribution (%)") +
  coord_flip() +
  theme_minimal(base_size = 14)

ggplot(vi_long %>% filter(metric == "Correlation"),
       aes(x = reorder(variable, importance), y = importance)) +
  geom_bar(stat = "identity", fill = "#e41a1c", width = 0.7) +
  labs(title = "Environmental Variable Importance (Correlation)",
       x = "Variable",
       y = "Contribution (%)") +
  coord_flip() +
  theme_minimal(base_size = 14)


Response Curves for Key Bioclimatic Variables

rcurve(m, n = c("bio14", "bio15", "bio18"), smooth = TRUE)
## The id argument is not specified; The modelIDs of 6 successfully fitted models are assigned to id...!

rcurve(m, n = c("bio19", "bio2", "bio8"), smooth = TRUE)
## The id argument is not specified; The modelIDs of 6 successfully fitted models are assigned to id...!


References