Installing/ loading the packages

# install.packages('sf')
# install.packages('RStoolbox')
# install.packages('maptools')
# install.packages("devtools")
# install_github("narayanibarve/ENMGadgets")
# install.packages("ncdf4")
library(devtools)
library(ENMGadgets)
require(ENMGadgets)
library(rgeos)
library(rgdal)
library(sf) # This package allows you to import shapefiles
library(RStoolbox) #This package allows you to perform PCA on rasters or raster stacks
library(ggplot2)
library(reshape2)
library(maptools)
library(readr)
library(usdm)
library(spThin)
library(dplyr)
library(dismo)
library(maxnet)
library(mapview)
library(MASS) # for 2D kernel density function
library(magrittr) # for piping functionality, i.e., %>%
library(maptools) # reading shapefiles
library(ncdf4)
library(raster)
require(raster)
library(brainGraph)
library(corrplot)
library(rmaxent)
library(ENMeval)
library(rnaturalearth)
library(rnaturalearthdata)
library(rnaturalearthhires)

IMPORT the bioclimatic variables

Here we are using 2.5 arc minute resolution (not exactly sure what that translates to in metric area)

bio1 <- raster("WC_2_2.5/wc2.1_2.5m_bio_1.tif")
bio2 <- raster("WC_2_2.5/wc2.1_2.5m_bio_2.tif")
bio3 <- raster("WC_2_2.5/wc2.1_2.5m_bio_3.tif")
bio4 <- raster("WC_2_2.5/wc2.1_2.5m_bio_4.tif")
bio5 <- raster("WC_2_2.5/wc2.1_2.5m_bio_5.tif")
bio6 <- raster("WC_2_2.5/wc2.1_2.5m_bio_6.tif")
bio7 <- raster("WC_2_2.5/wc2.1_2.5m_bio_7.tif")
bio8 <- raster("WC_2_2.5/wc2.1_2.5m_bio_8.tif")
bio9 <- raster("WC_2_2.5/wc2.1_2.5m_bio_9.tif")
bio10 <- raster("WC_2_2.5/wc2.1_2.5m_bio_10.tif")
bio11 <- raster("WC_2_2.5/wc2.1_2.5m_bio_11.tif")
bio12 <- raster("WC_2_2.5/wc2.1_2.5m_bio_12.tif")
bio13 <- raster("WC_2_2.5/wc2.1_2.5m_bio_13.tif")
bio14 <- raster("WC_2_2.5/wc2.1_2.5m_bio_14.tif")
bio15 <- raster("WC_2_2.5/wc2.1_2.5m_bio_15.tif")
bio16 <- raster("WC_2_2.5/wc2.1_2.5m_bio_16.tif")
bio17 <- raster("WC_2_2.5/wc2.1_2.5m_bio_17.tif")
bio18 <- raster("WC_2_2.5/wc2.1_2.5m_bio_18.tif")
bio19 <- raster("WC_2_2.5/wc2.1_2.5m_bio_19.tif")

bioclim2.5 <- stack(bio1, bio2 ,bio3, bio4, bio5,
                    bio6, bio7, bio8, bio9, bio10,
                    bio11, bio12, bio13, bio14, 
                    bio15, bio16, bio17, bio18,
                    bio19)
names(bioclim2.5) <- c('bio01', 'bio02', 'bio03', 'bio04', 
                       'bio05', 'bio06', 'bio07', 'bio08', 
                       'bio09', 'bio10', 'bio11', 'bio12', 
                       'bio13', 'bio14', 'bio15', 'bio16', 
                       'bio17', 'bio18', 'bio19')
crs(bioclim2.5) <- "+proj=longlat +datum=WGS84 +ellps=WGS84 +towgs84=0,0,0"

Importing presence data

These presence data have already been cleaned

x_trop_pres_all <- read.csv('data/clean_x_trop_NEW.csv')# This is the full dataframe (we will only use this for creating the accessible area)
x_trop_pres_thin_2.5 <- read.csv('data/x_tropicalis_2.5km_thin1_new.csv')# This is the dataframe we will use (thinned by 2.5 km)

african_anurans <- read.csv('data/clean_all_anurans.csv') #This is a dataframe of points (within our area of interest) for all African anurans.

Importing the native range extent

Because we are using the native range to generate background data, we need to use the native range to ‘clip’ the bioclim raster. This is easy to do by creating an account in the IUCN redlist, and then just downloading a shapefile of the geographic range. Then I saved the shapefile in the working directory, and this command

x_trop_native_range_shapefile <- shapefile('data/x_trop_sf.shp')
projection(bioclim2.5)
## [1] "+proj=longlat +datum=WGS84 +ellps=WGS84 +towgs84=0,0,0"
projection(x_trop_native_range_shapefile)
## [1] "+proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0"
crs(x_trop_native_range_shapefile) <- "+proj=longlat +datum=WGS84 +ellps=WGS84 +towgs84=0,0,0"

Here I am just making sure both the bioclim data and the range shapefile share the same projection. In this instance they do, but you can reproject them to match in R

Reprojecting all shapefiles

Because all three are in different projections, I have to reproject them so that they match

north_am_map <- ne_countries(scale = "large", continent = 'north america')
africa_map <- ne_countries(scale = 'large', continent = 'africa')
plot(africa_map)

##Matching projections first
crs(africa_map) <- "+proj=longlat +datum=WGS84 +ellps=WGS84 +towgs84=0,0,0"
projection(north_am_map)
## [1] "+proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0"
us_map <- ne_states(country = 'United States of America')
FL_map <- us_map[us_map$name == 'Florida',]
crs(FL_map) <- crs(bioclim2.5)
US_contig <- us_map[us_map$name != 'Alaska',]
US_contig <- us_map[us_map$name != 'Hawaii' & us_map$name != 'Alaska',]
crs(US_contig) <- "+proj=longlat +datum=WGS84 +ellps=WGS84 +towgs84=0,0,0"

Creating accesible area

I am going to do this by adding a buffer around the IUCN native range polygon. Given the relatively high uncertainty with the range, I decided to go with creating a 250 km buffer around all occurrence points.
x_trop_pres_all <- data.frame(x_trop_pres_all)
coordinates(x_trop_pres_all) <- ~ decimalLongitude + decimalLatitude
crs(x_trop_pres_all) <- "+proj=longlat +datum=WGS84 +ellps=WGS84 +towgs84=0,0,0" 
x_trop_native_range__buffer_small = buffer(x_trop_pres_all, width = 250000, dissolve = T)
plot(x_trop_native_range__buffer_small) #But this extends into the ocean now

x_trop_native_range__buffer_no_water <- gIntersection(x_trop_native_range__buffer_small, africa_map, byid=FALSE) # This clips the buffer so that it does not extends into the ocean
plot(x_trop_native_range__buffer_no_water)

Cropping the Bioclim raster

Here I am cropping the the bioclim raster stack based on the native range shapefile I imported above. Then I am plotting all the bioclim variables to demonstrate they are clipped in the extent of the imported shapefile

crs(bioclim2.5) <- '+proj=longlat +datum=WGS84 +ellps=WGS84 +towgs84=0,0,0'

NR_bioclim2.5 <- crop(bioclim2.5, extent(x_trop_native_range__buffer_no_water))
NR_bioclim2.5 <- mask(NR_bioclim2.5, x_trop_native_range__buffer_no_water)
plot(x_trop_native_range__buffer_no_water)

FL_bioclim2.5 <- crop(bioclim2.5, extent(FL_map))
FL_bioclim2.5 <- mask(FL_bioclim2.5, FL_map)
plot(FL_bioclim2.5)

Reducing the bioclim dataset

Now I am first going to look at the correlation structure, and then remove variables that are above 0.75 correlation coefficient
NR_bioclim2.5_cor_df <- na.omit(as.matrix(NR_bioclim2.5))
names(NR_bioclim2.5) <- c('bio01', 'bio02', 'bio03', 'bio04', 
                                         'bio05', 'bio06', 'bio07', 'bio08', 
                                         'bio09', 'bio10', 'bio11', 'bio12', 
                                         'bio13', 'bio14', 'bio15', 'bio16', 
                                         'bio17', 'bio18', 'bio19')

res <- cor(NR_bioclim2.5_cor_df)
round(res, 3)
##        bio01  bio02  bio03  bio04  bio05  bio06  bio07  bio08  bio09  bio10
## bio01  1.000  0.179 -0.393  0.468  0.685  0.177  0.276  0.838  0.649  0.900
## bio02  0.179  1.000 -0.621  0.664  0.802 -0.888  0.957  0.161 -0.417  0.448
## bio03 -0.393 -0.621  1.000 -0.849 -0.799  0.637 -0.810 -0.277  0.237 -0.710
## bio04  0.468  0.664 -0.849  1.000  0.836 -0.631  0.826  0.484 -0.277  0.793
## bio05  0.685  0.802 -0.799  0.836  1.000 -0.565  0.879  0.552  0.023  0.880
## bio06  0.177 -0.888  0.637 -0.631 -0.565  1.000 -0.890  0.099  0.741 -0.192
## bio07  0.276  0.957 -0.810  0.826  0.879 -0.890  1.000  0.247 -0.416  0.597
## bio08  0.838  0.161 -0.277  0.484  0.552  0.099  0.247  1.000  0.450  0.754
## bio09  0.649 -0.417  0.237 -0.277  0.023  0.741 -0.416  0.450  1.000  0.306
## bio10  0.900  0.448 -0.710  0.793  0.880 -0.192  0.597  0.754  0.306  1.000
## bio11  0.710 -0.244  0.117 -0.244  0.160  0.594 -0.255  0.436  0.864  0.392
## bio12 -0.404 -0.425  0.623 -0.694 -0.593  0.365 -0.538 -0.439  0.088 -0.607
## bio13 -0.214 -0.151  0.262 -0.393 -0.266  0.110 -0.211 -0.284  0.027 -0.320
## bio14 -0.320 -0.492  0.675 -0.510 -0.601  0.414 -0.571 -0.140  0.052 -0.501
## bio15  0.420  0.709 -0.785  0.865  0.786 -0.674  0.824  0.441 -0.267  0.692
## bio16 -0.243 -0.152  0.272 -0.429 -0.282  0.111 -0.220 -0.345  0.029 -0.352
## bio17 -0.344 -0.544  0.756 -0.590 -0.663  0.475 -0.641 -0.142  0.084 -0.558
## bio18 -0.526 -0.481  0.813 -0.748 -0.747  0.388 -0.637 -0.369 -0.029 -0.744
## bio19 -0.124 -0.440  0.481 -0.545 -0.415  0.494 -0.515 -0.308  0.343 -0.340
##        bio11  bio12  bio13  bio14  bio15  bio16  bio17  bio18  bio19
## bio01  0.710 -0.404 -0.214 -0.320  0.420 -0.243 -0.344 -0.526 -0.124
## bio02 -0.244 -0.425 -0.151 -0.492  0.709 -0.152 -0.544 -0.481 -0.440
## bio03  0.117  0.623  0.262  0.675 -0.785  0.272  0.756  0.813  0.481
## bio04 -0.244 -0.694 -0.393 -0.510  0.865 -0.429 -0.590 -0.748 -0.545
## bio05  0.160 -0.593 -0.266 -0.601  0.786 -0.282 -0.663 -0.747 -0.415
## bio06  0.594  0.365  0.110  0.414 -0.674  0.111  0.475  0.388  0.494
## bio07 -0.255 -0.538 -0.211 -0.571  0.824 -0.220 -0.641 -0.637 -0.515
## bio08  0.436 -0.439 -0.284 -0.140  0.441 -0.345 -0.142 -0.369 -0.308
## bio09  0.864  0.088  0.027  0.052 -0.267  0.029  0.084 -0.029  0.343
## bio10  0.392 -0.607 -0.320 -0.501  0.692 -0.352 -0.558 -0.744 -0.340
## bio11  1.000  0.094  0.123 -0.049 -0.177  0.127 -0.026 -0.058  0.272
## bio12  0.094  1.000  0.840  0.579 -0.552  0.872  0.614  0.743  0.744
## bio13  0.123  0.840  1.000  0.279 -0.118  0.982  0.280  0.405  0.598
## bio14 -0.049  0.579  0.279  1.000 -0.565  0.244  0.975  0.731  0.291
## bio15 -0.177 -0.552 -0.118 -0.565  1.000 -0.161 -0.651 -0.721 -0.455
## bio16  0.127  0.872  0.982  0.244 -0.161  1.000  0.250  0.425  0.646
## bio17 -0.026  0.614  0.280  0.975 -0.651  0.250  1.000  0.798  0.332
## bio18 -0.058  0.743  0.405  0.731 -0.721  0.425  0.798  1.000  0.462
## bio19  0.272  0.744  0.598  0.291 -0.455  0.646  0.332  0.462  1.000
res[ res > -0.75 & res < 0.75 ] = 0
corrplot(res, type = "upper", order = "alphabet", 
         tl.col = "black", tl.srt = 45, method = "number")

#Now I am just rerunning the model with the chosen variables
NR_bioclim2.5_cor_df <- na.omit(as.matrix(NR_bioclim2.5[[c(4,6,8,9,16,17),]]))

res <- cor(NR_bioclim2.5_cor_df)
round(res, 3)
##        bio04  bio06  bio08  bio09  bio16  bio17
## bio04  1.000 -0.631  0.484 -0.277 -0.429 -0.590
## bio06 -0.631  1.000  0.099  0.741  0.111  0.475
## bio08  0.484  0.099  1.000  0.450 -0.345 -0.142
## bio09 -0.277  0.741  0.450  1.000  0.029  0.084
## bio16 -0.429  0.111 -0.345  0.029  1.000  0.250
## bio17 -0.590  0.475 -0.142  0.084  0.250  1.000
res[ res > -0.75 & res < 0.75 ] = 0
corrplot(res, type = "upper", order = "alphabet", 
         tl.col = "black", tl.srt = 45, method = "number")

# Creating the reduced stacks
NR_bioclim2.5_reduced <- NR_bioclim2.5[[c(4,6,8,9,16,17),]]
FL_bioclim2.5_reduced <- FL_bioclim2.5[[c(4,6,8,9,16,17),]]

#I am also going to add a raster I made that represents forest cover. It is a categorical raster where values of 1 are forest, and values of 0 are all other land category types.

FL_forest_resam_mask <- raster('forest_cover/FL_forest_resam_mask.tif')
NR_forest_resam_mask <- raster('forest_cover/NR_forest_resam_mask.tif')
crs(FL_forest_resam_mask)<- '+proj=longlat +datum=WGS84 +ellps=WGS84 +towgs84=0,0,0'
crs(NR_forest_resam_mask)<- '+proj=longlat +datum=WGS84 +ellps=WGS84 +towgs84=0,0,0'

# Adding the forest raster to the bioclim raster stack
NR_bioclim2.5_reduced <- NR_bioclim2.5[[c(4,6,8,9,16,17),]]
NR_bioclim2.5_reduced <- stack(NR_bioclim2.5_reduced, NR_forest_resam_mask)
names(NR_bioclim2.5_reduced[[7]]) <- 'forest'

FL_bioclim2.5_reduced <- FL_bioclim2.5[[c(4,6,8,9,16,17),]]
FL_bioclim2.5_reduced <- stack(FL_bioclim2.5_reduced, FL_forest_resam_mask)
names(FL_bioclim2.5_reduced[[7]]) <- 'forest'

Creating the bias layer

To do this, we will use the african anuran dataframe. We will create a raster heatmap based on the density of points across our accesible area
#First we have to call this function, which will be used later to sample from the raster (this is what Rob was talking about, but I could not get the package to load properly)

sampleRast <- function(x, n, adjArea = TRUE, replace = TRUE, prob = TRUE) {
  
  val <- as.vector(x[[1]])
  
  # adjust probabilities for cell area and/or cell values
  if (adjArea) {
    
    areas <- raster::area(x[[1]], na.rm=TRUE)
    areas <- as.vector(areas)
    probs <- if (prob) {
      val * areas
    } else {
      areas
    }
    
  } else if (!adjArea & prob) {
    
    if (any(probs < 0)) warning('Some probabilities are < 0.', .immediate=TRUE)
    probs <- val
    
  } else if (!adjArea & !prob) {
    
    probs <- rep(1, length(val))
    
  }
  
  cellNum <- 1:raster::ncell(x)
  cellNum <- cellNum[!is.na(val)]
  
  probs <- probs[!is.na(val)]
  
  sites <- sample(cellNum, size=n, replace=replace, prob=probs)
  xy <- raster::xyFromCell(x, sites)
  
  xy
  
}

#bias layer function
biaslayer <- function(occs_df,longitude, latitude, raster_mold){
  ll_ras <- raster::rasterize(occs_df[,c(longitude,latitude)],
                              raster_mold, 1)
  ll_rasIDs <- which(raster::getValues(ll_ras) %in% 1)
  occ_T <- sp::coordinates(ll_ras)[ll_rasIDs,]

  dens <- MASS::kde2d(occ_T[,1], occ_T[,2],
                      n = c(nrow(ll_ras),
                            ncol(ll_ras)))
  biasLayer <- raster::raster(dens)
  if(!all(dim(biasLayer)[1:2] == dim(raster_mold)[1:2]))
    biasLayer <- raster::resample(biasLayer,raster_mold)
  return(biasLayer)
}

african_anurans <- data.frame(african_anurans)
coordinates(african_anurans) <- ~ decimalLongitude + decimalLatitude
crs(african_anurans) <- crs(NR_bioclim2.5)
african_anurans_crop <- crop(african_anurans, NR_bioclim2.5)
                             
african_anurans_crop <- data.frame(african_anurans_crop)

anuran_bias_layer <- biaslayer(occs_df =african_anurans_crop, 
                               longitude = 'decimalLongitude', latitude = 'decimalLatitude', 
                               raster_mold = NR_bioclim2.5_reduced$bio04)
plot(anuran_bias_layer)

anuran_bias_layer<- crop(anuran_bias_layer, NR_bioclim2.5_reduced[[1]])
anuran_bias_layer<- mask(anuran_bias_layer, NR_bioclim2.5_reduced[[1]])

background_T<-sampleRast(anuran_bias_layer,2000,adjArea=TRUE, replace=FALSE,prob=TRUE) #sampling from the surrogate species bias layer to create a set of bias points

plot(anuran_bias_layer)
points(background_T)

#####As you can see, there are a greater number of these points in areas with higher density of Anuran occurrence

NR_bioclim2.5_df2 <- data.frame(rasterToPoints(NR_bioclim2.5_reduced))


mess1 <- mess(FL_bioclim2.5_reduced[[1:6]], NR_bioclim2.5_df2[3:8])
plot(mess1)

mess1[mess1<0]=-1
mess1[mess1>=0]=1
mess2 <- mess(FL_bioclim2.5_reduced[[2:6]], NR_bioclim2.5_df2[4:8])
plot(mess2)

mess1[mess1<0]=-1
mess1[mess1>=0]=1
The takeaway here is that as a result of bio4 (seasonality), we are extrapolating across most of Florida. Without that variable, almost everything is available

Model 1 set up (no bias layer)

#Converting the raster grids to a points dataframe
NR_bioclim2.5_reduced_df <-rasterToPoints(NR_bioclim2.5_reduced, spatial=TRUE)
NR_bioclim2.5_reduced_df <- data.frame(NR_bioclim2.5_reduced_df, 
                            lon=coordinates(NR_bioclim2.5_reduced_df)[,1],
                            lat=coordinates(NR_bioclim2.5_reduced_df)[,2])             

coordinates(NR_bioclim2.5_reduced_df) <- ~lon +lat
NR_bioclim2.5_reduced_df <- data.frame(NR_bioclim2.5_reduced_df)

## Generating 10000 points for background data
bg <- randomPoints(NR_bioclim2.5_reduced[[1]], n=10000)
bg <- as.data.frame(bg)
bg.grp <- rep(0, 2000) # generating a blank vector with a length of 2000 which will serve as the actual background points

#Here, we are using random k-fold validation to split the data into training/ testing. We are using 10 folds, meaning there will be 10 different models averaged together to determin test/ training AUC
random <- get.randomkfold(x_trop_pres_thin_2.5, bg, k=10)
coordinates(x_trop_pres_thin_2.5) <- ~ decimalLongitude + decimalLatitude
#Plot to see the results of the K-folds
plot(NR_bioclim2.5[[1]], legend=FALSE)
points(x_trop_pres_thin_2.5, pch=21, bg=random$occ.grp)

Model 1

Attributes of this model:

All presence points (thinned so that none are within 2.5 km from each other)
Background points are randomly selected from entire accessible area
4 diff models being tested - linear only, linear-quadratic, linear-product, linear-quadratic-product
Beta multiplier of 1
# The actual model
model_point_thin_kfold <- ENMevaluate(occ = coordinates(x_trop_pres_thin_2.5),
                                      env=NR_bioclim2.5_reduced,
                                      bg.coords =bg ,occ.grp = random$occ.grp ,
                                      bg.grp = bg.grp, RMvalues =1, 
                                      fc=c('L','LQ','LP','LQP'), categoricals= 'forest', method = 'user',
                                      algorithm = 'maxent.jar',clamp = TRUE,rasterPreds = TRUE,
                                      parallel = TRUE,progbar = TRUE, updateProgress = TRUE)
## Loading required namespace: rJava
## *** Running ENMevaluate using Maxent 3.4.1 via dismo 1.1.4 ***
## Doing user-defined evaluation groups...
## There are 1 occurrence records with NA for at least
##                   one predictor variable. Removing these records from analysis,
##                   resulting in 140 records...
## Of 4 total cores using 4
## Running in parallel...
## ENMeval completed in 3 minutes 10.4 seconds.

Model 1 results

#This will be a table with AICc, and corresponding AUCs for each model
model_point_thin_kfold@results
#Here, the model with the lowest AICc is model 3, which uses only linear and product responses
#Exploring the best model more in depth
aic.opt <- model_point_thin_kfold@models[[which(model_point_thin_kfold@results$delta.AICc==0)]]
aic.opt@results
##                                                                                          [,1]
## X.Training.samples                                                                   140.0000
## Regularized.training.gain                                                              0.6444
## Unregularized.training.gain                                                            0.7011
## Iterations                                                                           500.0000
## Training.AUC                                                                           0.8203
## X.Background.points                                                                10000.0000
## bio04.contribution                                                                    49.2825
## bio06.contribution                                                                     6.8817
## bio08.contribution                                                                     7.6385
## bio09.contribution                                                                    23.1694
## bio16.contribution                                                                     3.7010
## bio17.contribution                                                                     8.4180
## forest.contribution                                                                    0.9088
## bio04.permutation.importance                                                          69.3905
## bio06.permutation.importance                                                           9.5559
## bio08.permutation.importance                                                           2.5498
## bio09.permutation.importance                                                           2.8316
## bio16.permutation.importance                                                           5.7066
## bio17.permutation.importance                                                           9.9656
## forest.permutation.importance                                                          0.0000
## Entropy                                                                                8.5648
## Prevalence..average.probability.of.presence.over.background.sites.                     0.3131
## Fixed.cumulative.value.1.cumulative.threshold                                          1.0000
## Fixed.cumulative.value.1.Cloglog.threshold                                             0.0501
## Fixed.cumulative.value.1.area                                                          0.7824
## Fixed.cumulative.value.1.training.omission                                             0.0071
## Fixed.cumulative.value.5.cumulative.threshold                                          5.0000
## Fixed.cumulative.value.5.Cloglog.threshold                                             0.1489
## Fixed.cumulative.value.5.area                                                          0.5696
## Fixed.cumulative.value.5.training.omission                                             0.0571
## Fixed.cumulative.value.10.cumulative.threshold                                        10.0000
## Fixed.cumulative.value.10.Cloglog.threshold                                            0.2543
## Fixed.cumulative.value.10.area                                                         0.4486
## Fixed.cumulative.value.10.training.omission                                            0.0714
## Minimum.training.presence.cumulative.threshold                                         0.0509
## Minimum.training.presence.Cloglog.threshold                                            0.0110
## Minimum.training.presence.area                                                         0.9659
## Minimum.training.presence.training.omission                                            0.0000
## X10.percentile.training.presence.cumulative.threshold                                 17.5063
## X10.percentile.training.presence.Cloglog.threshold                                     0.3934
## X10.percentile.training.presence.area                                                  0.3468
## X10.percentile.training.presence.training.omission                                     0.1000
## Equal.training.sensitivity.and.specificity.cumulative.threshold                       28.4949
## Equal.training.sensitivity.and.specificity.Cloglog.threshold                           0.5468
## Equal.training.sensitivity.and.specificity.area                                        0.2571
## Equal.training.sensitivity.and.specificity.training.omission                           0.2571
## Maximum.training.sensitivity.plus.specificity.cumulative.threshold                    16.9019
## Maximum.training.sensitivity.plus.specificity.Cloglog.threshold                        0.3811
## Maximum.training.sensitivity.plus.specificity.area                                     0.3533
## Maximum.training.sensitivity.plus.specificity.training.omission                        0.0929
## Balance.training.omission..predicted.area.and.threshold.value.cumulative.threshold     1.2965
## Balance.training.omission..predicted.area.and.threshold.value.Cloglog.threshold        0.0602
## Balance.training.omission..predicted.area.and.threshold.value.area                     0.7547
## Balance.training.omission..predicted.area.and.threshold.value.training.omission        0.0071
## Equate.entropy.of.thresholded.and.original.distributions.cumulative.threshold          6.5382
## Equate.entropy.of.thresholded.and.original.distributions.Cloglog.threshold             0.1780
## Equate.entropy.of.thresholded.and.original.distributions.area                          0.5243
## Equate.entropy.of.thresholded.and.original.distributions.training.omission             0.0571
#Here we are looking at the importance of each variable
var.importance(aic.opt)
#These are the coefficients for each variable in the model. The interesting part here is that there appears to be a negative relationship with forest. I suspect this is because forested areas are probably less well-sampled. We should mention this in our talk
parse_lambdas(lambdas = model_point_thin_kfold@models[[4]])
## 
## Features with non-zero weights
## 
##        feature   lambda       min       max        type
##  (forest==0.0)   0.0180     0.000 1.000e+00 categorical
##  (forest==1.0)  -0.1795     0.000 1.000e+00 categorical
##          bio04  -9.2947    57.808 4.373e+02      linear
##          bio06  -8.4929     9.368 2.293e+01      linear
##          bio09  -2.9338    17.677 3.009e+01      linear
##          bio16  10.7476    93.000 2.691e+03      linear
##          bio17   0.3265     0.000 4.740e+02      linear
##        bio06^2   6.9108    87.759 5.259e+02   quadratic
##        bio08^2   1.6133   232.339 8.948e+02   quadratic
##        bio16^2  -1.3068  8649.000 7.241e+06   quadratic
##        bio17^2  -3.4569     0.000 2.247e+05   quadratic
##    bio04*bio06   6.9850   792.782 5.574e+03     product
##    bio04*bio16 -12.3507 38392.383 4.719e+05     product
##    bio04*bio17   6.2465     0.000 5.452e+04     product
##    bio08*bio16   0.6736  2711.384 7.087e+04     product
##    bio16*bio17  -2.9645     0.000 6.494e+05     product
## 
## 
## Features with zero weights
## 
##  feature lambda   min   max   type
##    bio08      0 15.24 29.91 linear
#Lastly, the response curves for the best model
response(model_point_thin_kfold@models[[3]])

Model 1: predictions

#Predicting relative probability over the accessible area
NR_point_thin_pred <- (predict(model_point_thin_kfold@models[[3]],NR_bioclim2.5_reduced))
plot(NR_point_thin_pred)

#Predicting over Florida
FL_point_thin_pred <- (predict(model_point_thin_kfold@models[[3]],FL_bioclim2.5_reduced))
plot(FL_point_thin_pred)

#Setting the threshold for presence absence to 5%
pvtest <- data.frame(raster::extract(NR_point_thin_pred, coordinates(x_trop_pres_thin_2.5)))
pvtest_na <- na.omit(pvtest)
colnames(pvtest_na) <- 'probability'
quantile(pvtest_na$probability, 0.05)
##       5% 
## 0.146155
FL_point_thin_pred_thresholded <- FL_point_thin_pred
FL_point_thin_pred_thresholded[FL_point_thin_pred_thresholded <= quantile(pvtest_na$probability, 0.05) ] = 0
FL_point_thin_pred_thresholded[FL_point_thin_pred_thresholded >quantile(pvtest_na$probability, 0.05) ] = 1

#Plotting the thresholded model
plot(FL_point_thin_pred_thresholded)

#Setting the threshold for presence absence to 10%
pvtest <- data.frame(raster::extract(NR_point_thin_pred, coordinates(x_trop_pres_thin_2.5)))
pvtest_na <- na.omit(pvtest)
colnames(pvtest_na) <- 'probability'
quantile(pvtest_na$probability, 0.1)
##      10% 
## 0.392179
FL_point_thin_pred_thresholded <- FL_point_thin_pred
FL_point_thin_pred_thresholded[FL_point_thin_pred_thresholded <= quantile(pvtest_na$probability, 0.1) ] = 0
FL_point_thin_pred_thresholded[FL_point_thin_pred_thresholded >quantile(pvtest_na$probability, 0.1) ] = 1

#Plotting the thresholded model
plot(FL_point_thin_pred_thresholded)

Model 2 the set up (with bias layer)

The only thing that will be differ is that we will be sampling from the bias layer.

Reminder: Essentially, the point of the surrogate species layer is to determine the areas that we have evidence a we sampled. We want to maximize the odds of our background data being from areas where our species is not. Given this, we are preferentially sampling from areas where there are more occurrences of other anurans.

#Just creating an empty vector with the length of the bias file points we made earlier
bg.grp2 <- rep(0, nrow(background_T))

Model 2 (with bias layer)

model_bias_layer_kfold <- ENMevaluate(occ = coordinates(x_trop_pres_thin_2.5),
                                      env=NR_bioclim2.5_reduced,
                                      bg.coords =background_T ,occ.grp = random$occ.grp ,
                                      bg.grp = bg.grp2, RMvalues =1, 
                                      fc=c('L','LQ','LP','LQP'),categoricals= 'forest', method = 'user',
                                      algorithm = 'maxent.jar',clamp = TRUE,rasterPreds = TRUE,
                                      parallel = TRUE,progbar = TRUE, updateProgress = TRUE)
## *** Running ENMevaluate using Maxent 3.4.1 via dismo 1.1.4 ***
## Doing user-defined evaluation groups...
## There are 1 occurrence records with NA for at least
##                   one predictor variable. Removing these records from analysis,
##                   resulting in 140 records...
## Of 4 total cores using 4
## Running in parallel...
## ENMeval completed in 1 minutes 39.4 seconds.

Model 2: exploring the results

#This will be a table with AICc, and corresponding AUCs for each model
model_bias_layer_kfold@results
#Here, the model with the lowest AICc is model 2, which uses only linear and quadratic responses
#Exploring the best model more in depth
aic.opt <- model_bias_layer_kfold@models[[which(model_bias_layer_kfold@results$delta.AICc==0)]]
aic.opt@results
##                                                                                         [,1]
## X.Training.samples                                                                  140.0000
## Regularized.training.gain                                                             0.3289
## Unregularized.training.gain                                                           0.3922
## Iterations                                                                          500.0000
## Training.AUC                                                                          0.7393
## X.Background.points                                                                2000.0000
## bio04.contribution                                                                   20.2926
## bio06.contribution                                                                   14.4640
## bio08.contribution                                                                   24.4935
## bio09.contribution                                                                    5.2310
## bio16.contribution                                                                   11.4872
## bio17.contribution                                                                   19.7694
## forest.contribution                                                                   4.2623
## bio04.permutation.importance                                                         16.0417
## bio06.permutation.importance                                                          6.6748
## bio08.permutation.importance                                                         18.5616
## bio09.permutation.importance                                                         22.3335
## bio16.permutation.importance                                                          7.1537
## bio17.permutation.importance                                                         26.4045
## forest.permutation.importance                                                         2.8302
## Entropy                                                                               7.2709
## Prevalence..average.probability.of.presence.over.background.sites.                    0.4368
## Fixed.cumulative.value.1.cumulative.threshold                                         1.0000
## Fixed.cumulative.value.1.Cloglog.threshold                                            0.1496
## Fixed.cumulative.value.1.area                                                         0.9440
## Fixed.cumulative.value.1.training.omission                                            0.0000
## Fixed.cumulative.value.5.cumulative.threshold                                         5.0000
## Fixed.cumulative.value.5.Cloglog.threshold                                            0.2200
## Fixed.cumulative.value.5.area                                                         0.8025
## Fixed.cumulative.value.5.training.omission                                            0.0286
## Fixed.cumulative.value.10.cumulative.threshold                                       10.0000
## Fixed.cumulative.value.10.Cloglog.threshold                                           0.2974
## Fixed.cumulative.value.10.area                                                        0.6820
## Fixed.cumulative.value.10.training.omission                                           0.0714
## Minimum.training.presence.cumulative.threshold                                        1.0352
## Minimum.training.presence.Cloglog.threshold                                           0.1501
## Minimum.training.presence.area                                                        0.9425
## Minimum.training.presence.training.omission                                           0.0000
## X10.percentile.training.presence.cumulative.threshold                                11.6077
## X10.percentile.training.presence.Cloglog.threshold                                    0.3156
## X10.percentile.training.presence.area                                                 0.6505
## X10.percentile.training.presence.training.omission                                    0.1000
## Equal.training.sensitivity.and.specificity.cumulative.threshold                      36.1898
## Equal.training.sensitivity.and.specificity.Cloglog.threshold                          0.5268
## Equal.training.sensitivity.and.specificity.area                                       0.3215
## Equal.training.sensitivity.and.specificity.training.omission                          0.3214
## Maximum.training.sensitivity.plus.specificity.cumulative.threshold                   45.3326
## Maximum.training.sensitivity.plus.specificity.Cloglog.threshold                       0.6017
## Maximum.training.sensitivity.plus.specificity.area                                    0.2420
## Maximum.training.sensitivity.plus.specificity.training.omission                       0.3714
## Balance.training.omission..predicted.area.and.threshold.value.cumulative.threshold    1.0352
## Balance.training.omission..predicted.area.and.threshold.value.Cloglog.threshold       0.1501
## Balance.training.omission..predicted.area.and.threshold.value.area                    0.9425
## Balance.training.omission..predicted.area.and.threshold.value.training.omission       0.0000
## Equate.entropy.of.thresholded.and.original.distributions.cumulative.threshold         8.3077
## Equate.entropy.of.thresholded.and.original.distributions.Cloglog.threshold            0.2707
## Equate.entropy.of.thresholded.and.original.distributions.area                         0.7185
## Equate.entropy.of.thresholded.and.original.distributions.training.omission            0.0571
#Here we are looking at the importance of each variable
var.importance(aic.opt)
#These are the coefficients for each variable in the model. The interesting part here is that there appears to be a negative relationship with forest. I suspect this is because forested areas are probably less well-sampled. We should mention this in our talk
parse_lambdas(lambdas = model_bias_layer_kfold@models[[2]])
## 
## Features with non-zero weights
## 
##        feature   lambda       min       max        type
##  (forest==0.0)  0.18705     0.000 1.000e+00 categorical
##  (forest==1.0) -0.09713     0.000 1.000e+00 categorical
##          bio04 -4.88232    59.602 4.171e+02      linear
##          bio06 -1.75674     9.536 2.284e+01      linear
##          bio08  6.27506    14.767 2.945e+01      linear
##          bio09 -1.06353    17.345 2.926e+01      linear
##          bio16  0.19720   130.000 2.618e+03      linear
##          bio17  2.69677     0.000 4.550e+02      linear
##        bio09^2 -1.73788   300.837 8.560e+02   quadratic
##        bio16^2  2.02128 16900.000 6.854e+06   quadratic
##        bio17^2 -0.16032     0.000 2.070e+05   quadratic
#Lastly, the response curves for the best model
response(model_bias_layer_kfold@models[[2]])

Model 2: predictions

#Predicting relative probability over the accessible area
NR_bias_pred <- (predict(model_bias_layer_kfold@models[[2]],NR_bioclim2.5_reduced))
plot(NR_bias_pred)

#Predicting over Florida
FL_NR_bias_pred<- (predict(model_bias_layer_kfold@models[[2]],FL_bioclim2.5_reduced))
plot(FL_NR_bias_pred)

#Setting the threshold for presence absence to 5%
pvtest <- data.frame(raster::extract(NR_bias_pred, coordinates(x_trop_pres_thin_2.5)))
pvtest_na <- na.omit(pvtest)
colnames(pvtest_na) <- 'probability'
quantile(pvtest_na$probability, 0.05)
##        5% 
## 0.2800685
FL_NR_bias_pred_thresholded <- FL_NR_bias_pred
FL_NR_bias_pred_thresholded[FL_NR_bias_pred_thresholded <= quantile(pvtest_na$probability, 0.05) ] = 0
FL_NR_bias_pred_thresholded[FL_NR_bias_pred_thresholded >quantile(pvtest_na$probability, 0.05) ] = 1

#Plotting the thresholded model
plot(FL_NR_bias_pred_thresholded)

#Setting the threshold for presence absence to 10%
pvtest <- data.frame(raster::extract(NR_bias_pred, coordinates(x_trop_pres_thin_2.5)))
pvtest_na <- na.omit(pvtest)
colnames(pvtest_na) <- 'probability'
quantile(pvtest_na$probability, 0.1)
##      10% 
## 0.308446
FL_NR_bias_pred_thresholded <- FL_NR_bias_pred
FL_NR_bias_pred_thresholded[FL_NR_bias_pred_thresholded <= quantile(pvtest_na$probability, 0.1) ] = 0
FL_NR_bias_pred_thresholded[FL_NR_bias_pred_thresholded >quantile(pvtest_na$probability, 0.1) ] = 1

#Plotting the thresholded model
plot(FL_NR_bias_pred_thresholded)