if (!require("rspat")) remotes::install_github("rspatial/rspat") ###Loading required package: rspat ###Loading required package: terra ###terra 1.7.62
## Loading required package: rspat
## Loading required package: terra
## terra 1.7.83
if (!require("predicts")) install.packages("predicts") ###Loading required package: predicts
## Loading required package: predicts
if (!require("geodata")) install.packages("geodata") ###Loading required package: geodata
## Loading required package: geodata
library(terra) # Spatial data analysis
library(rspat) # Spatial distribution modeling
bf <- spat_data("bigfoot") # Load dataset for analysis
dim(bf) # Get data dimensions
## [1] 3092 3
head(bf) # Preview first few rows of data
## lon lat Class
## 1 -142.9000 61.50000 A
## 2 -132.7982 55.18720 A
## 3 -132.8202 55.20350 A
## 4 -141.5667 62.93750 A
## 5 -149.7853 61.05950 A
## 6 -141.3165 62.77335 A
plot(bf[,1:2], cex=0.5, col="red") # Plot species occurrence points
library(geodata) # Load geodata
wrld <- geodata::world(path=".") # Load world boundaries
bnds <- wrld[wrld$NAME_0 %in%
c("Canada", "Mexico", "United States"), ]
lines(bnds) # Overlay boundaries
- Purpose: Visualize species occurrence in specific regions. -
Interpretation: Map with red points showing occurrences and
boundaries for selected countries. - Use: Explore spatial
distribution and regional context of data.
wc <- geodata::worldclim_global("bio", res=10, ".") # Load climate data
plot(wc[[c(1, 12)]], nr=2) # Plot temperature and precipitation
- Purpose: Visualize global climate data. -
Interpretation: Displays two climate variables (temperature and
precipitation) globally. - Use: Helps understand environmental
variables relevant to species distribution.
bfc <- extract(wc, bf[,1:2]) # Extract climate data for species occurrence points
head(bfc, 3) # Preview the first 3 rows
## ID wc2.1_10m_bio_1 wc2.1_10m_bio_2 wc2.1_10m_bio_3 wc2.1_10m_bio_4
## 1 1 -1.832979 12.504708 28.95899 1152.4308
## 2 2 6.360650 5.865935 32.27475 462.5731
## 3 3 6.360650 5.865935 32.27475 462.5731
## wc2.1_10m_bio_5 wc2.1_10m_bio_6 wc2.1_10m_bio_7 wc2.1_10m_bio_8
## 1 20.34075 -22.840000 43.18075 5.327750
## 2 16.65505 -1.519947 18.17500 3.964495
## 3 16.65505 -1.519947 18.17500 3.964495
## wc2.1_10m_bio_9 wc2.1_10m_bio_10 wc2.1_10m_bio_11 wc2.1_10m_bio_12
## 1 -0.6887083 11.80792 -16.038542 991
## 2 10.4428196 12.28183 1.467686 3079
## 3 10.4428196 12.28183 1.467686 3079
## wc2.1_10m_bio_13 wc2.1_10m_bio_14 wc2.1_10m_bio_15 wc2.1_10m_bio_16
## 1 120 42 31.32536 337
## 2 448 141 35.27518 1127
## 3 448 141 35.27518 1127
## wc2.1_10m_bio_17 wc2.1_10m_bio_18 wc2.1_10m_bio_19
## 1 157 288 216
## 2 468 630 873
## 3 468 630 873
bfc <- bfc[,-1] # Remove the ID column
plot(bfc[ ,"wc2.1_10m_bio_1"], bfc[, "wc2.1_10m_bio_12"], col="red", xlab="Annual mean temperature (.C)", ylab="Annual precipitation (mm)")
ext_bf <- ext(vect(bf[, 1:2])) + 1 # Create an extended spatial extent around occurrence points
ext_bf # Display the extent
## SpatExtent : -157.75, -63.4627, 24.141, 70.5 (xmin, xmax, ymin, ymax)
set.seed(0)
window(wc) <- ext_bf
bg <- spatSample(wc, 5000, "random", na.rm=TRUE, xy=TRUE)
head(bg)
## x y wc2.1_10m_bio_1 wc2.1_10m_bio_2 wc2.1_10m_bio_3
## 1 -99.2500 66.75000 -13.2934895 7.870646 14.96619
## 2 -106.0833 42.08333 5.6722708 14.530958 36.82943
## 3 -111.9167 46.58333 6.7605939 14.135854 35.23372
## 4 -106.9167 54.75000 0.4086979 11.528605 24.43290
## 5 -118.2500 67.08333 -9.1363859 8.185354 16.34505
## 6 -111.2500 38.91667 8.4194584 15.997125 38.84047
## wc2.1_10m_bio_4 wc2.1_10m_bio_5 wc2.1_10m_bio_6 wc2.1_10m_bio_7
## 1 1638.6833 15.42850 -37.16100 52.58950
## 2 894.3715 27.86600 -11.58875 39.45475
## 3 927.7927 28.14375 -11.97650 40.12025
## 4 1290.1088 22.55225 -24.63250 47.18475
## 5 1567.0846 17.46575 -32.61275 50.07850
## 6 904.0610 30.49050 -10.69625 41.18675
## wc2.1_10m_bio_8 wc2.1_10m_bio_9 wc2.1_10m_bio_10 wc2.1_10m_bio_11
## 1 6.484917 -31.617332 7.518209 -31.76942
## 2 9.226916 -4.839750 17.168291 -4.83975
## 3 15.638333 -4.921750 18.186209 -4.92175
## 4 15.417084 -13.864500 15.417084 -16.31392
## 5 8.609292 -21.353209 10.573625 -27.49783
## 6 19.076958 3.179209 19.812834 -2.50475
## wc2.1_10m_bio_12 wc2.1_10m_bio_13 wc2.1_10m_bio_14 wc2.1_10m_bio_15
## 1 171 33 4 70.29919
## 2 288 42 13 38.78144
## 3 293 48 9 53.40759
## 4 471 86 16 58.32499
## 5 223 43 7 61.21693
## 6 228 28 11 32.40370
## wc2.1_10m_bio_16 wc2.1_10m_bio_17 wc2.1_10m_bio_18 wc2.1_10m_bio_19
## 1 90 13 78 13
## 2 112 41 90 41
## 3 129 35 115 35
## 4 220 53 220 56
## 5 108 27 93 29
## 6 83 40 72 44
plot(bg[, c("x", "y")])
- Purpose: Visualize the distribution of randomly sampled
points. - Interpretation: A scatter plot showing the locations
of sampled points. - Use: Confirms spatial coverage and
distribution of background data.
head(bg)
## x y wc2.1_10m_bio_1 wc2.1_10m_bio_2 wc2.1_10m_bio_3
## 1 -99.2500 66.75000 -13.2934895 7.870646 14.96619
## 2 -106.0833 42.08333 5.6722708 14.530958 36.82943
## 3 -111.9167 46.58333 6.7605939 14.135854 35.23372
## 4 -106.9167 54.75000 0.4086979 11.528605 24.43290
## 5 -118.2500 67.08333 -9.1363859 8.185354 16.34505
## 6 -111.2500 38.91667 8.4194584 15.997125 38.84047
## wc2.1_10m_bio_4 wc2.1_10m_bio_5 wc2.1_10m_bio_6 wc2.1_10m_bio_7
## 1 1638.6833 15.42850 -37.16100 52.58950
## 2 894.3715 27.86600 -11.58875 39.45475
## 3 927.7927 28.14375 -11.97650 40.12025
## 4 1290.1088 22.55225 -24.63250 47.18475
## 5 1567.0846 17.46575 -32.61275 50.07850
## 6 904.0610 30.49050 -10.69625 41.18675
## wc2.1_10m_bio_8 wc2.1_10m_bio_9 wc2.1_10m_bio_10 wc2.1_10m_bio_11
## 1 6.484917 -31.617332 7.518209 -31.76942
## 2 9.226916 -4.839750 17.168291 -4.83975
## 3 15.638333 -4.921750 18.186209 -4.92175
## 4 15.417084 -13.864500 15.417084 -16.31392
## 5 8.609292 -21.353209 10.573625 -27.49783
## 6 19.076958 3.179209 19.812834 -2.50475
## wc2.1_10m_bio_12 wc2.1_10m_bio_13 wc2.1_10m_bio_14 wc2.1_10m_bio_15
## 1 171 33 4 70.29919
## 2 288 42 13 38.78144
## 3 293 48 9 53.40759
## 4 471 86 16 58.32499
## 5 223 43 7 61.21693
## 6 228 28 11 32.40370
## wc2.1_10m_bio_16 wc2.1_10m_bio_17 wc2.1_10m_bio_18 wc2.1_10m_bio_19
## 1 90 13 78 13
## 2 112 41 90 41
## 3 129 35 115 35
## 4 220 53 220 56
## 5 108 27 93 29
## 6 83 40 72 44
bg <- bg[, -c(1:2)] # Remove coordinate columns (x, y) from the background data
head(bg)
## wc2.1_10m_bio_1 wc2.1_10m_bio_2 wc2.1_10m_bio_3 wc2.1_10m_bio_4
## 1 -13.2934895 7.870646 14.96619 1638.6833
## 2 5.6722708 14.530958 36.82943 894.3715
## 3 6.7605939 14.135854 35.23372 927.7927
## 4 0.4086979 11.528605 24.43290 1290.1088
## 5 -9.1363859 8.185354 16.34505 1567.0846
## 6 8.4194584 15.997125 38.84047 904.0610
## wc2.1_10m_bio_5 wc2.1_10m_bio_6 wc2.1_10m_bio_7 wc2.1_10m_bio_8
## 1 15.42850 -37.16100 52.58950 6.484917
## 2 27.86600 -11.58875 39.45475 9.226916
## 3 28.14375 -11.97650 40.12025 15.638333
## 4 22.55225 -24.63250 47.18475 15.417084
## 5 17.46575 -32.61275 50.07850 8.609292
## 6 30.49050 -10.69625 41.18675 19.076958
## wc2.1_10m_bio_9 wc2.1_10m_bio_10 wc2.1_10m_bio_11 wc2.1_10m_bio_12
## 1 -31.617332 7.518209 -31.76942 171
## 2 -4.839750 17.168291 -4.83975 288
## 3 -4.921750 18.186209 -4.92175 293
## 4 -13.864500 15.417084 -16.31392 471
## 5 -21.353209 10.573625 -27.49783 223
## 6 3.179209 19.812834 -2.50475 228
## wc2.1_10m_bio_13 wc2.1_10m_bio_14 wc2.1_10m_bio_15 wc2.1_10m_bio_16
## 1 33 4 70.29919 90
## 2 42 13 38.78144 112
## 3 48 9 53.40759 129
## 4 86 16 58.32499 220
## 5 43 7 61.21693 108
## 6 28 11 32.40370 83
## wc2.1_10m_bio_17 wc2.1_10m_bio_18 wc2.1_10m_bio_19
## 1 13 78 13
## 2 41 90 41
## 3 35 115 35
## 4 53 220 56
## 5 27 93 29
## 6 40 72 44
plot(bg[,1], bg[,12], # Create a scatter plot
xlab="Annual mean temperature (°C)", # Label x as "Annual mean temperature (°C)".
ylab="Annual precipitation (mm)", # Label y as "Annual precipitation (mm)".
cex=.8) # Adjust the size of the points to be slightly smaller (scale = 0.8).
points(bfc[,1], bfc[,12], # Overlay points from bfc dataset with x = bfc[,1] and y = bfc[,12].
col="red", # Use red color for these points.
cex=.6, # Make these points smaller (scale = 0.6).
pch="+") # Use a "+" shape for these points.
legend("topleft", # Place the legend at the top-left corner of the plot.
c("observed", "background"), # Add labels: "observed" and "background".
col=c("red", "black"), # Use red for "observed" and black for "background".
pch=c("+", "o"), # Match point shapes: "+" for "observed" and "o" for "background".
pt.cex=c(.6, .8)) # Match point sizes: 0.6 for "observed" and 0.8 for "background".
#eastern points
bfe <- bfc[bf[,1] > -102, ] # Select eastern species points (longitude > -102)
#western points
bfw <- bfc[bf[,1] <= -102, ] # Select western species points (longitude <= -102)
dw <- rbind(cbind(pa=1, bfw), cbind(pa=0, bg)) # Combine western points (presence) with background (absence)
de <- rbind(cbind(pa=1, bfe), cbind(pa=0, bg)) # Combine eastern points (presence) with background (absence)
dw <- data.frame(dw) # Convert western data to a data frame
de <- data.frame(na.omit(de)) # Convert eastern data to a data frame and remove missing values
dim(dw) # Output dimensions of western dataset
## [1] 6224 20
dim(de) # Output dimensions of eastern dataset
## [1] 6866 20
library(rpart) # Load the rpart library for CART modeling.
cart <- rpart(pa ~ ., data = dw) # Fit a classification and regression tree (CART) model to the western dataset (dw).
printcp(cart) # Print the complexity parameter table for the CART model.
##
## Regression tree:
## rpart(formula = pa ~ ., data = dw)
##
## Variables actually used in tree construction:
## [1] wc2.1_10m_bio_10 wc2.1_10m_bio_12 wc2.1_10m_bio_14 wc2.1_10m_bio_18
## [5] wc2.1_10m_bio_19 wc2.1_10m_bio_3 wc2.1_10m_bio_4
##
## Root node error: 983.29/6224 = 0.15798
##
## n= 6224
##
## CP nsplit rel error xerror xstd
## 1 0.322797 0 1.00000 1.00019 0.019351
## 2 0.080521 1 0.67720 0.68771 0.019687
## 3 0.073325 2 0.59668 0.59759 0.015480
## 4 0.068645 3 0.52336 0.52163 0.015231
## 5 0.027920 4 0.45471 0.47026 0.014758
## 6 0.014907 5 0.42679 0.44778 0.015136
## 7 0.010869 6 0.41188 0.44082 0.015406
## 8 0.010197 7 0.40102 0.43500 0.015330
## 9 0.010000 8 0.39082 0.43197 0.015319
CP
: Complexity
parameter controlling tree size. nsplit
: Number of splits
in the tree. xerror
: Cross-validated error for each model.
Smaller xerror
indicates better model performance.cp
value to avoid
overfitting.plotcp(cart) # Plot cross-validation error against the complexity parameter (cp).
cart <- rpart(pa ~ ., data = dw, cp = 0.02) # Refit CART using the chosen complexity parameter (cp = 0.02).
library(rpart.plot) # Load library for visualizing decision trees.
rpart.plot(cart, uniform = TRUE, main = "Regression Tree") # Plot the pruned regression tree.
Nodes:
Splits based on
predictor thresholds. -Leaf nodes:
Final predictions for
presence/absence.x <- predict(wc, cart) # Predict species suitability using the CART model.
x <- mask(x, wc[[1]]) # Mask predictions to limit them to the input climate data region.
x <- round(x, 2) # Round predictions to two decimal places for clarity.
plot(x, type = "class", plg = list(x = "bottomleft")) # Visualize predicted suitability.
set.seed(123) # Set random seed for reproducibility.
i <- sample(nrow(dw), 0.2 * nrow(dw)) # Randomly sample 20% of data for the test set.
test <- dw[i,] # Assign 20% of data to the test set.
train <- dw[-i,] # Use remaining 80% as the training set.
fpa <- as.factor(train[, 'pa']) # Convert species presence/absence (pa) to a factor for classification.
train
contains 80% of the data
for model fitting. test
contains 20% of the data for model
validation. fpa
ensures that the response variable (pa) is
treated as a categorical factor.library(randomForest) # Load the randomForest package.
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
crf <- randomForest(train[, 2:ncol(train)], fpa) # Fit a Random Forest model with the training data.
crf # Display model summary.
##
## Call:
## randomForest(x = train[, 2:ncol(train)], y = fpa)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 4
##
## OOB estimate of error rate: 7.19%
## Confusion matrix:
## 0 1 class.error
## 0 3832 165 0.04128096
## 1 193 790 0.19633774
- *Purpose:*
Fit a Random Forest model using the
training data (train) to predict species presence/absence
(fpa
). - *Interpretation:*
-Model output
(crf
) includes:
–OOB error rate:
The error rate on the out-of-bag samples
(used for model validation). –Variable importance:
Ranking
of features by their contribution to prediction accuracy.
-Random Forest:
Ensemble of decision trees, improving
accuracy by averaging over multiple trees. - *Use:*
Understand model performance, including accuracy and variable
importance.
The output shows a Random Forest classification model with 500 trees and a selection of 4 variables per split. The out-of-bag (OOB) error rate is 7.19%, indicating the model misclassifies about 7.19% of unseen data. The confusion matrix shows the model’s performance with 3832 true negatives, 165 false positives, 193 false negatives, and 790 true positives. The class error for predicting 0 is 4.13%, and for predicting 1 is 19.63%. These values highlight the model’s accuracy and misclassification tendencies.
varImpPlot(crf) # Plot the importance of variables in predicting species presence.
trf <- tuneRF(train[, 2:ncol(train)], train[, "pa"]) # Tune the Random Forest model to find optimal parameters.
## Warning in randomForest.default(x, y, mtry = mtryStart, ntree = ntreeTry, : The
## response has five or fewer unique values. Are you sure you want to do
## regression?
## mtry = 6 OOB error = 0.05594974
## Searching left ...
## Warning in randomForest.default(x, y, mtry = mtryCur, ntree = ntreeTry, : The
## response has five or fewer unique values. Are you sure you want to do
## regression?
## mtry = 3 OOB error = 0.05612125
## -0.003065481 0.05
## Searching right ...
## Warning in randomForest.default(x, y, mtry = mtryCur, ntree = ntreeTry, : The
## response has five or fewer unique values. Are you sure you want to do
## regression?
## mtry = 12 OOB error = 0.05485775
## 0.01951734 0.05
trf # Display tuning results.
## mtry OOBError
## 3 3 0.05612125
## 6 6 0.05594974
## 12 12 0.05485775
mt <- trf[which.min(trf[,2]), 1] # Extract the optimal `mtry` (number of variables considered at each split).
mt # Display the optimal `mtry` value.
## [1] 12
rrf <- randomForest(train[, 2:ncol(train)], train[, "pa"], mtry=mt, ntree=250)
## Warning in randomForest.default(train[, 2:ncol(train)], train[, "pa"], mtry =
## mt, : The response has five or fewer unique values. Are you sure you want to
## do regression?
plot(rrf)