Madagascar is a large island located along the eastern coast of Africa, it is home to unique wildlife and have one of the highest rates of species endemism (Antonelli et al. 2022). The island was estimated to have at least 14,000 species of plants which 80% are only found in Madagascar. Unfortunately, agriculture expansion and exploitation of biological resources are threatening Madagascar flora (Ralimanana et al. 2022). One of these plant include Mespilodaphne racemosa which is Near Threatened on the International Union for Conservation of Nature Red List (Rabarimanarivo 2022). Mespilodaphne racemosa belongs to the Lauraceae family and is a small tree found in a wide range of forest types. In addition to the threats above, the plant also suffers from other causes like mining and artificial fires. The plant is a source of construction and structural material for local communities and harvested for medicine. Thus, Mespilodaphne racemosa is part of flora species recovery efforts in Madagascar. To inform conservation plans for this species, this study aims to identify the habitats suitable for Mespilodaphne racemosa on the African island. By using species distribution modelling (Vignali et al. 2020) (Velazco et al. 2022), the model calculates the amount of potential suitable habitats found in current Protected Areas and provides recommendations for the future conservation of the species.
library(rgbif)
library(DT)
library(tidyverse)
library(CoordinateCleaner)
library(rnaturalearth)
library(rnaturalearthdata)
library(sf)
library(mapview)
library(terra)
library(flexsdm)
library(here)
library(corrplot)
library(virtualspecies)
library(SDMtune)
library(exactextractr)
spp <- c("Mespilodaphne racemosa")
gbif_download <- occ_data(scientificName = spp, hasCoordinate = TRUE, country = 'MG', limit = 1000)
gbif_data <- gbif_download$data
gbif_data %>%
filter(!decimalLatitude == 0 | !decimalLongitude == 0) %>%
distinct(decimalLongitude,decimalLatitude,speciesKey,datasetKey, .keep_all = TRUE) -> gbif_filt
gbif_filt %>%
cc_cen(lat = 'decimalLatitude', lon = 'decimalLongitude', buffer = 2000) %>%
cc_cap(lat = 'decimalLatitude', lon = 'decimalLongitude', buffer = 2000) %>%
cc_inst(lat = 'decimalLatitude', lon = 'decimalLongitude', buffer = 2000) %>%
cc_sea(lat = 'decimalLatitude', lon = 'decimalLongitude') -> spp_clean
## Testing sea coordinates
## Testing biodiversity institutions
## Testing country capitals
## Testing country centroids
## Removed 1 records.
## Removed 0 records.
## Removed 0 records.
## OGR data source with driver: ESRI Shapefile
## Source: "C:\Users\karlp\AppData\Local\Temp\RtmpOMszmv", layer: "ne_110m_land"
## with 127 features
## It has 3 fields
## Removed 85 records.
mad <- ne_countries(country = 'Madagascar', scale = 'medium', returnclass = 'sf')
spp_clean %>%
dplyr::select(key, year, basisOfRecord, locality, coordinateUncertaintyInMeters, occurrenceRemarks, habitat, recordedBy, lon = decimalLongitude, lat = decimalLatitude) -> spp_sel
spp_sf <- st_as_sf(spp_sel, coords = c('lon', 'lat'))
st_crs(spp_sf) = 4326
worldclim <- rast(here("data/rast/afr_worldclim.tif"))
names(worldclim)
## [1] "bio1" "bio2" "bio3" "bio4" "bio5" "bio6" "bio7" "bio8" "bio9"
## [10] "bio10" "bio11" "bio12" "bio13" "bio14" "bio15" "bio16" "bio17" "bio18"
## [19] "bio19"
names(worldclim) <- c("mean_ann_t","mean_diurnal_t_range","isothermality", "t_seas", 'max_t_warm_m','min_t_cold_m',"t_ann_range",'mean_t_wet_q','mean_t_dry_q','mean_t_warm_q','mean_t_cold_q','ann_p', 'p_wet_m','p_dry_m','p_seas','p_wet_q','p_dry_q','p_warm_q','p_cold_q')
wc_mad <- worldclim %>%
crop(., vect(mad)) %>%
mask(., vect(mad))
set.seed(42)
spp_filt <- occfilt_geo(
data = spp_sel %>% rename(x = lon, y = lat),
x = 'x',
y = 'y',
method = c('defined', d = 10),
env_layer = wc_mad,
prj = crs(wc_mad)
)
## Extracting values from raster ...
## 2 records were removed because they have NAs for some variables
## Number of unfiltered records: 365
## Distance threshold(km): 10
## Number of filtered records: 131
spp_filt_sf <- spp_filt %>%
st_as_sf(coords = c('x','y'), crs = st_crs(4326))
spp_removed_sf <- spp_sel %>%
filter(!key %in% spp_filt$key) %>%
st_as_sf(coords = c('lon','lat'), crs = st_crs(4326))
Figure 1. Maps of different environment variables in Madagascar.
set.seed(42)
pseudo_abs <- sample_pseudoabs(
data = spp_filt,
x = 'x',
y = 'y',
n = nrow(spp_filt),
method = c('geo_const', width = 10000),
rlayer = wc_mad,
calibarea = vect(mad)
)
pres <- spp_filt %>%
dplyr::select(x, y) %>%
mutate(pr_ab = 1)
all_pts <- rbind(pres, pseudo_abs)
all_pts_sf <- all_pts %>%
mutate(pr_ab = as.factor(pr_ab)) %>%
st_as_sf(coords = c('x','y'), crs = st_crs(4326))
spp <- c("Mespilodaphne racemosa")
spp_code <- gsub(' ', '_', spp)
all_pts <- vect(here(paste0('output/species_localities/',spp_code, '_all_points.shp')))
mad_sf <- ne_countries(country = 'Madagascar', scale = 'medium', returnclass = 'sf')
mad <- vect(mad_sf)
worldclim <- rast(here("data/rast/afr_worldclim.tif"))
names(worldclim) <- c("mean_ann_t","mean_diurnal_t_range","isothermality", "t_seas", 'max_t_warm_m','min_t_cold_m',"t_ann_range",'mean_t_wet_q','mean_t_dry_q','mean_t_warm_q','mean_t_cold_q','ann_p', 'p_wet_m','p_dry_m','p_seas','p_wet_q','p_dry_q','p_warm_q','p_cold_q')
wc_mad <- worldclim %>%
crop(., mad) %>%
mask(., mad)
wc_mad[[c(1:2,5:11)]] <- wc_mad[[c(1:2,5:11)]]/10
wc_mad[[3:4]] <- wc_mad[[3:4]]/100
plot(wc_mad$mean_ann_t)
plot(mad, add = T)
plot(all_pts, add=T)
crs(wc_mad) == crs(mad)
## [1] FALSE
crs(wc_mad) == crs(all_pts)
## [1] TRUE
crs(mad) == crs(all_pts)
## [1] FALSE
mad <- project(mad, wc_mad)
crs(wc_mad) == crs(mad)
## [1] TRUE
cov_colin <- correct_colinvar(wc_mad, method = c('pearson', th = "0.7"))
corrplot(cov_colin$cor_table, type = 'lower', diag = FALSE, order = 'hclust', tl.cex = 0.6, tl.col = 'black', addCoef.col = 'black')
set.seed(42)
non_colin <- removeCollinearity(raster::stack(wc_mad), 0.7, method = 'pearson', plot = TRUE, select.variables = TRUE, sample.points = FALSE)
non_colin
## [1] "mean_ann_t" "mean_diurnal_t_range" "isothermality"
## [4] "t_seas" "mean_t_wet_q" "p_warm_q"
## [7] "p_wet_q" "p_seas"
wc_mad_sel <- wc_mad[[non_colin]]
set.seed(42)
non_colin_check <- removeCollinearity(raster::stack(wc_mad_sel), 0.7, method = 'pearson', plot = TRUE, select.variables = TRUE, sample.points = FALSE)
wc_mad_sel <- wc_mad_sel[[non_colin_check]]
set.seed(42)
non_colin_check_2 <- removeCollinearity(raster::stack(wc_mad_sel), 0.7, method = 'pearson', plot = TRUE, select.variables = TRUE, sample.points = FALSE)
## - No multicollinearity detected in your data at threshold 0.7
all_pts_df <- as.data.frame(all_pts, geom = 'XY')
SWDdata <- prepareSWD(
species = 'Mespilodaphne racemosa',
p = all_pts_df %>% filter(pr_ab == 1) %>% dplyr::select(x, y),
a = all_pts_df %>% filter(pr_ab == 0) %>% dplyr::select(x, y),
env = wc_mad_sel
)
## Extracting predictor information for presence locations...
## Extracting predictor information for absence/background locations...
SWDdata@data <- SWDdata@data[-1]
split_data <- trainValTest(SWDdata, test = 0.2, seed = 42)
train <- split_data[[1]]
test <- split_data[[2]]
set.seed(42)
rf_model <- train(method = c('RF'), ntrees = 500, data = train)
rf_model
## Object of class SDMmodel
## Method: RF
##
## Species: Mespilodaphne racemosa
## Presence locations: 105
## Absence locations: 105
##
## Model configurations:
## --------------------
## mtry: 2
## ntree: 500
## nodesize: 1
##
## Variables:
## ---------
## Continuous: mean_ann_t mean_diurnal_t_range isothermality t_seas p_warm_q p_wet_q p_seas
## Categorical: NA
cbind(train@pa, predict(rf_model, data = train)) %>%
as.data.frame() %>%
rename(true_class = V1, predicted_class = V2) %>%
mutate(threshold_0.4 = ifelse(predicted_class >= 0.5, 1, 0),
threshold_0.5 = ifelse(predicted_class >= 0.5, 1, 0),
threshold_0.6 = ifelse(predicted_class >= 0.6, 1, 0)) %>%
DT::datatable()
cm <- confMatrix(rf_model, test = test, th = 0.5)
cm_format <- as.data.frame(rbind(c(cm[,2],cm[,3]), c(cm[,4],cm[,5])))
names(cm_format) <- c('Positive', 'Negative')
rownames(cm_format) <- c('Positive', 'Negative')
cm_format %>% DT::datatable()
round((cm$tp + cm$tn)/sum(cm$tp, cm$tp, cm$fp, cm$fn)*100, 2)
## [1] 81.48
tss(rf_model, test = test)
## [1] 0.7307692
plotROC(rf_model, test = test)
## Warning: The following aesthetics were dropped during statistical transformation: m, d
## i This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## i Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
auc(rf_model, test = test)
## [1] 0.8816568
(ths <- thresholds(rf_model, test = test))
## Threshold value
## 1 Minimum training presence 0.626
## 2 Equal training sensitivity and specificity 0.626
## 3 Maximum training sensitivity plus specificity 0.626
## 4 Equal test sensitivity and specificity 0.568
## 5 Maximum test sensitivity plus specificity 0.490
## Fractional predicted area Training omission rate Test omission rate
## 1 0.00000000 0 0.11538462
## 2 0.00000000 0 0.11538462
## 3 0.00000000 0 0.11538462
## 4 0.00000000 0 0.07692308
## 5 0.00952381 0 0.03846154
## P-values
## 1 0.000000e+00
## 2 0.000000e+00
## 3 0.000000e+00
## 4 0.000000e+00
## 5 7.909635e-106
ths %>% DT::datatable()
cm_custom <- confMatrix(rf_model, test = test, th = 0.71)
(TPR <- cm_custom$tp/(cm_custom$tp + cm_custom$fn))
## [1] 0.5769231
(FPR <- 1 - cm_custom$tn/(cm_custom$tn + cm_custom$fp))
## [1] 0.1538462
plotROC(rf_model, test = test) +
geom_point(aes(x = FPR, y = TPR), col = 'red', size = 3) +
labs(x = 'False Positive Rate (1 - Specificity)',
y = 'True Positive Rate (Sensitivity)')
## Warning: The following aesthetics were dropped during statistical transformation: m, d
## i This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## i Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
vi <- varImp(rf_model, permut = 5)
plotVarImp(vi[,1:2])
SDMtune::plotResponse(rf_model, var = vi$Variable[1], marginal = TRUE, rug = TRUE)
SDMtune::plotResponse(rf_model, var = vi$Variable[2], marginal = TRUE, rug = TRUE)
pred <- predict(rf_model, data = raster::stack(wc_mad_sel))
pred_df <- as.data.frame(pred, xy = TRUE)
ggplot() +
geom_tile(data = pred_df, aes(x = x, y = y, fill = layer)) +
scale_fill_gradientn(colours = c("#2c7bb6", "#abd9e9", "#ffffbf", "#fdae61", "#d7191c"),
na.value = NA, name = 'Habitat\nsuitability') +
geom_sf(data = mad_sf, fill = NA) +
geom_point(data = all_pts_df %>% filter(pr_ab == 1), aes(x = x, y = y)) +
labs(x = 'Longitude', y = 'Latitude') +
theme_minimal()
ths <- thresholds(rf_model, test = test)
th = ths[5,2]
pred_th <- pred >= th
pred_th_df <- as.data.frame(pred_th, xy = TRUE)
ggplot() +
geom_tile(data = pred_th_df, aes(x = x, y = y, fill = as.factor(layer))) +
scale_fill_manual(values = c("#2c7bb6", "#d7191c"), na.value = NA,
name = paste0('Habitat\nsuitability\nbinary (>= ',th,')'),
labels = c('absent','present','')) +
geom_sf(data = mad_sf, fill = NA) +
geom_point(data = all_pts_df %>% filter(pr_ab == 1), aes(x = x, y = y)) +
labs(x = 'Longitude', y = 'Latitude') +
theme_minimal()
prot_areas <- list.files(here('data/vect/WDPA_Madagascar'), pattern = '*.shp', full.names = TRUE)
prot_areas_list <- lapply(prot_areas, read_sf)
prot_areas_all <- bind_rows(prot_areas_list) %>% filter(MARINE == 0)
prot_areas_all %>%
st_transform(crs = 'EPSG:29702') -> prot_areas_all_proj
pred_th %>%
projectRaster(crs = 'EPSG:29702', method = 'ngb') -> pred_th_proj
par(mfrow=c(1,2))
plot(rast(pred_th))
plot(vect(prot_areas_all), add = TRUE)
plot(rast(pred_th_proj))
plot(vect(prot_areas_all_proj), add = TRUE)
pres_area <- (sum(pred_th_proj[] == 1, na.rm = TRUE) * (res(pred_th_proj)[1]*res(pred_th_proj)[2]) / (1000^2))
paste('The area of species presences is',pres_area, 'km2')
## [1] "The area of species presences is 161822.2484 km2"
all_area <- (sum(!is.na(pred_th_proj[])) * (res(pred_th_proj)[1]*res(pred_th_proj)[2]) / (1000^2))
paste('The area of all cells is',all_area, 'km2')
## [1] "The area of all cells is 614746.4504 km2"
paste('The species presences cover',round(pres_area/all_area*100, 2), '% of Madagascar')
## [1] "The species presences cover 26.32 % of Madagascar"
sum_cover <- function(x){
list(x %>%
group_by(value) %>%
summarize(total_area = sum(coverage_area)))
}
extract_all <- exact_extract(pred_th_proj, prot_areas_all_proj, coverage_area = TRUE, summarize_df = TRUE, fun = sum_cover)
## Warning in .local(x, y, ...): Polygons transformed to raster CRS (EPSG:NA)
##
|
| | 0%
|
|= | 1%
|
|= | 2%
|
|== | 3%
|
|=== | 4%
|
|==== | 5%
|
|==== | 6%
|
|===== | 7%
|
|===== | 8%
|
|====== | 9%
|
|======= | 9%
|
|======= | 10%
|
|======== | 11%
|
|======== | 12%
|
|========= | 13%
|
|========== | 14%
|
|========== | 15%
|
|=========== | 15%
|
|=========== | 16%
|
|============ | 17%
|
|============= | 18%
|
|============= | 19%
|
|============== | 20%
|
|============== | 21%
|
|=============== | 21%
|
|================ | 22%
|
|================ | 23%
|
|================= | 24%
|
|================= | 25%
|
|================== | 26%
|
|=================== | 26%
|
|=================== | 27%
|
|==================== | 28%
|
|==================== | 29%
|
|===================== | 30%
|
|====================== | 31%
|
|====================== | 32%
|
|======================= | 32%
|
|======================= | 33%
|
|======================== | 34%
|
|========================= | 35%
|
|========================= | 36%
|
|========================== | 37%
|
|========================== | 38%
|
|=========================== | 38%
|
|============================ | 39%
|
|============================ | 40%
|
|============================= | 41%
|
|============================= | 42%
|
|============================== | 43%
|
|=============================== | 44%
|
|================================ | 45%
|
|================================ | 46%
|
|================================= | 47%
|
|================================== | 48%
|
|================================== | 49%
|
|=================================== | 50%
|
|==================================== | 51%
|
|==================================== | 52%
|
|===================================== | 53%
|
|====================================== | 54%
|
|====================================== | 55%
|
|======================================= | 56%
|
|======================================== | 57%
|
|========================================= | 58%
|
|========================================= | 59%
|
|========================================== | 60%
|
|========================================== | 61%
|
|=========================================== | 62%
|
|============================================ | 62%
|
|============================================ | 63%
|
|============================================= | 64%
|
|============================================= | 65%
|
|============================================== | 66%
|
|=============================================== | 67%
|
|=============================================== | 68%
|
|================================================ | 68%
|
|================================================ | 69%
|
|================================================= | 70%
|
|================================================== | 71%
|
|================================================== | 72%
|
|=================================================== | 73%
|
|=================================================== | 74%
|
|==================================================== | 74%
|
|===================================================== | 75%
|
|===================================================== | 76%
|
|====================================================== | 77%
|
|====================================================== | 78%
|
|======================================================= | 79%
|
|======================================================== | 79%
|
|======================================================== | 80%
|
|========================================================= | 81%
|
|========================================================= | 82%
|
|========================================================== | 83%
|
|=========================================================== | 84%
|
|=========================================================== | 85%
|
|============================================================ | 85%
|
|============================================================ | 86%
|
|============================================================= | 87%
|
|============================================================== | 88%
|
|============================================================== | 89%
|
|=============================================================== | 90%
|
|=============================================================== | 91%
|
|================================================================ | 91%
|
|================================================================= | 92%
|
|================================================================= | 93%
|
|================================================================== | 94%
|
|================================================================== | 95%
|
|=================================================================== | 96%
|
|==================================================================== | 97%
|
|===================================================================== | 98%
|
|===================================================================== | 99%
|
|======================================================================| 100%
# add the names of the protected areas back on to our extraction
names(extract_all) <- prot_areas_all_proj$ORIG_NAME
extract_df <- bind_rows(extract_all, .id = 'ORIG_NAME')
# take a look at the first 6 rows
head(extract_df)
## # A tibble: 6 x 3
## ORIG_NAME value total_area
## <chr> <dbl> <dbl>
## 1 Ankarafantsika 0 1695222372.
## 2 Andohahela 0 314157914.
## 3 Andohahela 1 557847871.
## 4 Marojejy 1 752626634.
## 5 Tsaratanana 0 307788647.
## 6 Tsaratanana 1 1182760985.
area_under_pas <- extract_df %>%
filter(value == 1) %>%
summarise(sum(total_area)/(1000^2))
paste(round(area_under_pas/pres_area * 100, 2),'% of the predicted presences are found within protected areas')
## [1] "30.16 % of the predicted presences are found within protected areas"
iucn_cat <- prot_areas_all_proj %>%
st_drop_geometry() %>%
dplyr::select(ORIG_NAME, IUCN_CAT)
extract_df %>%
left_join(iucn_cat, by = 'ORIG_NAME') %>%
filter(value == 1) %>%
group_by(IUCN_CAT) %>%
summarise(area = sum(total_area)/(1000^2)) %>%
mutate(perc = round(area/sum(area) * 100, 2))
## # A tibble: 7 x 3
## IUCN_CAT area perc
## <chr> <dbl> <dbl>
## 1 Ia 1224. 2.51
## 2 II 16102. 33.0
## 3 IV 3897. 7.98
## 4 Not Applicable 4249. 8.71
## 5 Not Reported 14134. 29.0
## 6 V 5632. 11.5
## 7 VI 3568. 7.31
Q: does your species have enough locality data to accurately predict your species distribution?
Q: how do we know if the model is accurately predicting the distribution of your species?
Q: which bioclimatic variables, which were filtered out earlier due to their high colinearity, might also be important in determining your species distribution?
Q: discuss the major environmental drivers of your species distribution, focus on the direction and magnitude of their influence and discuss this relative to other information you can find regarding your species? (e.g. tree in spiny dry forest grows in arid, strongly seasonal environment)
Q: Name and describe two ways we could make the modeling results more robust.