Installing/ loading packages

# install.packages('rnaturalearth') #high res world map
# # On my version of R, I have to install this package from Github
# install_github('ropensci/rnaturalearthhires') #This is the higher version of the package above (they both need to be installed)
# install.packages('gdalUtils')
library(rnaturalearth)
library(rnaturalearthdata)
library(rnaturalearthhires)
library(gdalUtils)
library(devtools)
library(ENMGadgets)
require(ENMGadgets)
library(rgeos)
library(rgdal)
library(sf) # This package allows you to import shapefiles
library(RStoolbox) #This package allows you to perform PCA on rasters or raster stacks
library(ggplot2)
library(reshape2)
library(maptools)
library(readr)
library(usdm)
library(spThin)
library(dplyr)
library(dismo)
library(maxnet)
library(mapview)
library(MASS) # for 2D kernel density function
library(magrittr) # for piping functionality, i.e., %>%
library(maptools) # reading shapefiles
library(ncdf4)
library(raster)
require(raster)
library(brainGraph)
library(corrplot)

Cleaning Occurrence records

Here I will be cleaning both the L. carinatus records as well as the bias layer records. For the bias layer, I am using the presence of all lizard families with a good number of diurnal species (@Natalie, since you know more than I do on this, feel free to let me know which clades you think do not belong)
## Importing occurrence data 
le_ca_occ <- read_delim("data/l_carianatus_GBIF_occurrence.txt", 
                        "\t", escape_double = FALSE, trim_ws = TRUE)
## Parsed with column specification:
## cols(
##   .default = col_logical(),
##   gbifID = col_double(),
##   accessRights = col_character(),
##   bibliographicCitation = col_character(),
##   created = col_date(format = ""),
##   identifier = col_character(),
##   language = col_character(),
##   license = col_character(),
##   modified = col_datetime(format = ""),
##   references = col_character(),
##   rights = col_character(),
##   rightsHolder = col_character(),
##   type = col_character(),
##   institutionID = col_character(),
##   collectionID = col_double(),
##   institutionCode = col_character(),
##   collectionCode = col_character(),
##   datasetName = col_character(),
##   ownerInstitutionCode = col_character(),
##   basisOfRecord = col_character(),
##   informationWithheld = col_character()
##   # ... with 96 more columns
## )
## See spec(...) for full column specifications.
## Warning: 1132 parsing failures.
##  row          col           expected                                                            actual                                    file
## 1436 collectionID a double           http://grbio.org/cool/fra1-wrdm | http://grbio.org/cool/vg5r-57jw 'data/l_carianatus_GBIF_occurrence.txt'
## 1462 datasetID    1/0/T/F/TRUE/FALSE HerpDWC                                                           'data/l_carianatus_GBIF_occurrence.txt'
## 1463 datasetID    1/0/T/F/TRUE/FALSE HerpDWC                                                           'data/l_carianatus_GBIF_occurrence.txt'
## 1464 datasetID    1/0/T/F/TRUE/FALSE HerpDWC                                                           'data/l_carianatus_GBIF_occurrence.txt'
## 1542 collectionID a double           http://grbio.org/cool/6e5v-fta9                                   'data/l_carianatus_GBIF_occurrence.txt'
## .... ............ .................. ................................................................. .......................................
## See problems(...) for more details.
squam_occ <- read_delim("data/squamate_occ.txt", 
                         "\t", escape_double = FALSE, trim_ws = TRUE)
## Parsed with column specification:
## cols(
##   .default = col_logical(),
##   gbifID = col_double(),
##   accessRights = col_character(),
##   bibliographicCitation = col_character(),
##   identifier = col_character(),
##   language = col_character(),
##   license = col_character(),
##   modified = col_datetime(format = ""),
##   publisher = col_character(),
##   references = col_character(),
##   type = col_character(),
##   institutionID = col_character(),
##   collectionID = col_character(),
##   institutionCode = col_character(),
##   collectionCode = col_character(),
##   datasetName = col_character(),
##   basisOfRecord = col_character(),
##   dynamicProperties = col_character(),
##   occurrenceID = col_character(),
##   catalogNumber = col_double(),
##   recordNumber = col_character()
##   # ... with 80 more columns
## )
## See spec(...) for full column specifications.
## Warning: 456165 parsing failures.
## row col    expected      actual                    file
##   1  -- 241 columns 240 columns 'data/squamate_occ.txt'
##   2  -- 241 columns 240 columns 'data/squamate_occ.txt'
##   3  -- 241 columns 240 columns 'data/squamate_occ.txt'
##   4  -- 241 columns 240 columns 'data/squamate_occ.txt'
##   5  -- 241 columns 240 columns 'data/squamate_occ.txt'
## ... ... ........... ........... .......................
## See problems(...) for more details.
dim(le_ca_occ)
## [1] 3975  241
unique(le_ca_occ$basisOfRecord)
## [1] "HUMAN_OBSERVATION"   "MACHINE_OBSERVATION" "PRESERVED_SPECIMEN" 
## [4] "FOSSIL_SPECIMEN"
unique(le_ca_occ$scientificName)
##  [1] "Leiocephalus carinatus Gray, 1827"                     
##  [2] "Leiocephalus carinatus aquarius Schwartz & Ogren, 1956"
##  [3] "Leiocephalus carinatus carinatus"                      
##  [4] "Leiocephalus carinatus armouri Barbour & Shreve, 1935" 
##  [5] "Leiocephalus carinatus varius Garman, 1887"            
##  [6] "Leiocephalus carinatus labrossytus Schwartz, 1959"     
##  [7] "Liocephalus carinatus Boulenger, 1885"                 
##  [8] "Leiocephalus carinatus hodsdoni"                       
##  [9] "Leiocephalus carinatus mogotensis Schwartz, 1959"      
## [10] "Leiocephalus carinatus coryi Schmidt, 1936"            
## [11] "Leiocephalus carinatus granti Rabb, 1957"              
## [12] "Leiocephalus carinatus virescens Stejneger, 1901"      
## [13] "Leiocephalus carinatus microcyon Schwartz, 1959"       
## [14] "Leiocephalus carinatus hodsoni Schmidt, 1936"          
## [15] "Liocephalus varius Garman, 1887"                       
## [16] "Leiocephalus carinatus cayensis Schwartz, 1959"        
## [17] "Leiocephalus carinatus zayasi Schwartz, 1959"
unique(le_ca_occ$specificEpithet)
## [1] "carinatus" "varius"
list.files('data/', full.names = FALSE)
##  [1] "air.mon.mean.nc"                  "citations.txt"                   
##  [3] "clean_le_ca_df.csv"               "dataset"                         
##  [5] "desktop.ini"                      "l_carianatus_GBIF_occurrence.txt"
##  [7] "l_carinatus_10km_thin1.csv"       "l_carinatus_1km_thin1.csv"       
##  [9] "l_carinatus_5km_thin1.csv"        "l_carinatus_IUCN.cpg"            
## [11] "l_carinatus_IUCN.dbf"             "l_carinatus_IUCN.prj"            
## [13] "l_carinatus_IUCN.shp"             "l_carinatus_IUCN.shx"            
## [15] "le_ca_pres_na.csv"                "meta.xml"                        
## [17] "metadata.xml"                     "multimedia.txt"                  
## [19] "rights.txt"                       "squamate_occ.txt"                
## [21] "test_L_carinatus.csv"             "tl_2016_12_cousub.cpg"           
## [23] "tl_2016_12_cousub.dbf"            "tl_2016_12_cousub.prj"           
## [25] "tl_2016_12_cousub.shp"            "tl_2016_12_cousub.shx"           
## [27] "verbatim.txt"
#Importing IUCN range map - We will use this to exclude all L. carinatus points outside of this area
le_ca_shapefile<-shapefile('data/l_carinatus_IUCN.shp')
plot(le_ca_shapefile)

projection(le_ca_shapefile)
## [1] "+proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0"
## Define required fields (GBIF gives a ton of superfluous variables)
gbif_fields = c("gbifID", "basisOfRecord", "catalogNumber", "collectionCode", "collectionID", "recordedBy", "continent", "countryCode", "stateProvince",  
                "county", "locality", "year", "month", "day", "decimalLatitude", "decimalLongitude", "coordinateUncertaintyInMeters", "establishmentMeans" , 
                "scientificName", "class", "order", "family", "genus", "specificEpithet", "infraspecificEpithet", "taxonomicStatus")

## Take the subset of fields from the data 
le_ca_reduced <- le_ca_occ[, gbif_fields]
squam_reduced <- squam_occ[, gbif_fields]

#Now I am reducing the bias layer data to see all of the families within the dataset
squam_reduced %>% 
  filter(family == " ") %>% 
  distinct(scientificName)
## # A tibble: 0 x 1
## # ... with 1 variable: scientificName <chr>
#Here I am subsetting to only include those families of interest (@Natalie, if you think some of these should be excluded, let me know)
squam_guild <- data.frame(
  squam_reduced %>% 
  filter(family == 'Agamidae' | family == 'Chamaeleonidae' | family == 'Cordylidae' |
           family == 'Corytophanidae' | family == 'Crotaphytidae' | family == 'Gerrhosauridae' |
           family == 'Iguanidae' | family == 'Lacertidae' | family == 'Leiocephalidae' |
           family == 'Phrynosomatidae' | family == 'Scincidae' | family == 'Teiidae' |
           family == 'Tropiduridae' | family == 'Varanidae')
)

# Because some of these records are not field collected specimens, we want to remove those
squam_guild_no_fossils <-  squam_guild %>% 
  filter(basisOfRecord != 'FOSSIL_SPECIMEN' & basisOfRecord != 'MACHINE_OBSERVATION')

## Remove occurrences with NA values. 
NA_index = which(is.na(le_ca_reduced$decimalLatitude) == TRUE | is.na(le_ca_reduced$decimalLatitude) == TRUE)
le_ca_reduced_no_NAs = le_ca_reduced[-NA_index, ]

Points on a map

###Creating a map
north_am_map <- ne_countries(scale = "large", continent = 'north america')

##Matching projections first
projection(north_am_map)
## [1] "+proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0"
projection(le_ca_shapefile)
## [1] "+proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0"
crs(north_am_map) <- crs(le_ca_shapefile) 

plot(north_am_map, axes=TRUE, col="light yellow",
     xlim = c(-90,-70),
     ylim = c(15, 30))
points(squam_guild_no_fossils$decimalLongitude, 
       squam_guild_no_fossils$decimalLatitude, 
       col='orange', pch=20, cex=0.75)
points(le_ca_reduced_no_NAs$decimalLongitude, 
        le_ca_reduced_no_NAs$decimalLatitude, 
        col='blue', pch=3, cex=0.75)
plot(le_ca_shapefile,add = TRUE)

Subsetting the maps

#subsetting the spatial data to only include florida
us_map <- ne_states(country = 'United States of America')
FL_map <- us_map[us_map$name == 'Florida',]
plot(FL_map)

#Subsetting the spatial data to only include Caribbean Islands
NR_map <- north_am_map[north_am_map$admin == 'The Bahamas' | 
               north_am_map$admin == 'Cayman Islands' |
               north_am_map$admin == 'Cuba',]
plot(NR_map)

#Matching projections
crs(FL_map) <- crs(le_ca_shapefile)
crs(NR_map) <- crs(le_ca_shapefile)

Clipping points by proper extents

#Converting to spatial dataframe
coordinates(le_ca_reduced_no_NAs) <- ~ decimalLongitude + decimalLatitude
coordinates(squam_guild_no_fossils) <- ~ decimalLongitude + decimalLatitude
head(le_ca_reduced_no_NAs)
## # A tibble: 6 x 24
##   gbifID basisOfRecord catalogNumber collectionCode collectionID recordedBy
##    <dbl> <chr>         <chr>         <chr>                 <dbl> <chr>     
## 1 2.58e9 HUMAN_OBSERV~ 40241530      Observations             NA doggoleo  
## 2 2.58e9 HUMAN_OBSERV~ 40250912      Observations             NA lizardtiff
## 3 2.58e9 HUMAN_OBSERV~ 40164474      Observations             NA Matt      
## 4 2.58e9 HUMAN_OBSERV~ 40164475      Observations             NA Matt      
## 5 2.58e9 HUMAN_OBSERV~ 40194513      Observations             NA reeseabro~
## 6 2.58e9 HUMAN_OBSERV~ 40105337      Observations             NA zara_natu~
## # ... with 18 more variables: continent <chr>, countryCode <chr>,
## #   stateProvince <chr>, county <chr>, locality <chr>, year <dbl>, month <dbl>,
## #   day <dbl>, coordinateUncertaintyInMeters <dbl>, establishmentMeans <chr>,
## #   scientificName <chr>, class <chr>, order <chr>, family <chr>, genus <chr>,
## #   specificEpithet <chr>, infraspecificEpithet <chr>, taxonomicStatus <chr>
class(le_ca_reduced_no_NAs)
## [1] "SpatialPointsDataFrame"
## attr(,"package")
## [1] "sp"
#Changing the projection of occurrences to match the IUCN shapefile
crs(le_ca_reduced_no_NAs) <- crs(le_ca_shapefile)
crs(squam_guild_no_fossils) <- crs(le_ca_shapefile)
projection(squam_guild_no_fossils)
## [1] "+proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0"
#Using the IUCN shapefile to clip occurrences outside of those bounds
le_ca_reduced_no_NAs_IUCN_native_range <- le_ca_reduced_no_NAs[le_ca_shapefile,]
NR_squam_guild_no_fossils <- squam_guild_no_fossils[NR_map,]
FL_squam_guild_no_fossils <- squam_guild_no_fossils[FL_map,] # THis will be for when we incorporate FL into the models

Removing duplicate values/ saving final cleaned records

## Remove duplicate records for L_carinatus. 
le_ca_reduced_no_NAs_IUCN_native_range<-data.frame(le_ca_reduced_no_NAs_IUCN_native_range)
dups <- duplicated(le_ca_reduced_no_NAs_IUCN_native_range[, c("decimalLatitude", "decimalLongitude")])
dups_index = which(dups == TRUE)
le_ca_reduced_no_NAs_IUCN_native_range = le_ca_reduced_no_NAs_IUCN_native_range[-dups_index, ]
dim(le_ca_reduced_no_NAs_IUCN_native_range)
## [1] 364  27
## Visually inspect the final cleaned records
plot(NR_map, axes=TRUE, col="light yellow")
points(le_ca_reduced_no_NAs_IUCN_native_range$decimalLongitude, 
       le_ca_reduced_no_NAs_IUCN_native_range$decimalLatitude, 
       col='orange', pch=20, cex=0.75)

####### SAVE THE FINAL CLEANED OCCURRENCE FILE
# write.csv(le_ca_reduced_no_NAs_IUCN_native_range, "data/clean_le_ca_df.csv", row.names = FALSE)

NR_squam_guild_no_fossils<-data.frame(NR_squam_guild_no_fossils)
dups2 <- duplicated(NR_squam_guild_no_fossils[, c("decimalLatitude", "decimalLongitude")])
dups_index2 = which(dups2 == TRUE)
NR_squam_guild_no_fossils = NR_squam_guild_no_fossils[-dups_index2, ]
dim(NR_squam_guild_no_fossils)
## [1] 172  27
## Visually inspect the final cleaned records
plot(NR_map, axes=TRUE, col="light yellow")
points(NR_squam_guild_no_fossils$decimalLongitude, 
       NR_squam_guild_no_fossils$decimalLatitude, 
       col='orange', pch=20, cex=0.75)

####### SAVE THE FINAL CLEANED OCCURRENCE FILE
# write.csv(NR_squam_guild_no_fossils, "data/clean_NR_squam_guild_bias_layer_df.csv", row.names = FALSE)

FL_squam_guild_no_fossils<-data.frame(FL_squam_guild_no_fossils)
dups3 <- duplicated(FL_squam_guild_no_fossils[, c("decimalLatitude", "decimalLongitude")])
dups_index3 = which(dups3 == TRUE)
FL_squam_guild_no_fossils = FL_squam_guild_no_fossils[-dups_index3, ]
dim(FL_squam_guild_no_fossils)
## [1] 2269   27
## Visually inspect the final cleaned records
plot(FL_map, axes=TRUE, col="light yellow")
points(FL_squam_guild_no_fossils$decimalLongitude, 
       FL_squam_guild_no_fossils$decimalLatitude, 
       col='orange', pch=20, cex=0.75)

####### SAVE THE FINAL CLEANED OCCURRENCE FILE
# write.csv(FL_squam_guild_no_fossils, "data/clean_FL_squam_guild_bias_layer_df.csv", row.names = FALSE)

Importing the bioclim data

# Downloading/ cropping BIOclim variables
bioclim2.5 <- getData("worldclim",var="bio",res=2.5)
projection(bioclim2.5)
## [1] "+proj=longlat +datum=WGS84 +ellps=WGS84 +towgs84=0,0,0"
projection(FL_map)
## [1] "+proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0"
projection(NR_map)
## [1] "+proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0"
#Changing projection of bioclim variables
crs(bioclim2.5) <- crs(NR_map)

# Cropping/ masking the bioclim variables for all of native range countries
NR_countries_bioclim2.5 <- crop(bioclim2.5, NR_map)
NR_countries_bioclim2.5 <- mask(NR_countries_bioclim2.5, NR_map)
plot(NR_countries_bioclim2.5$bio1)

FL_bioclim2.5 <- crop(bioclim2.5, FL_map)
FL_bioclim2.5 <- mask(FL_bioclim2.5, FL_map)
plot(FL_bioclim2.5$bio1)

Importing elevation data

#Showing the files to be imported
list.files('DEMs/', full.names = FALSE)
## [1] "10n090w_20101117_gmted_mea075.tif" "10n090w_20101117_gmted_mea300.tif"
## [3] "10n090w_20101117_gmted_med075.tif" "30n090w_20101117_gmted_mea075.tif"
## [5] "30n090w_20101117_gmted_mea300.tif" "30n090w_20101117_gmted_med075.tif"
#loading/ viewing the files
Caribbean_dem <- raster('DEMs/10n090w_20101117_gmted_mea300.tif')                 

US_dem <- raster('DEMs/30n090w_20101117_gmted_mea300.tif')

#Checking projections
projection(Caribbean_dem)
## [1] "+proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0"
projection(US_dem)
## [1] "+proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0"
projection(FL_bioclim2.5)
## [1] "+proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0"
#Cropping/ mapping the DEMs to the relevant areas
NR_dem <- crop(Caribbean_dem, NR_map)
NR_dem <- mask(NR_dem, NR_map)

FL_dem <- crop(mosaic(Caribbean_dem, US_dem, fun=mean), FL_map)
FL_dem <- mask(FL_dem, FL_map)

NR_dem_resampled_2.5 <- resample(NR_dem, NR_countries_bioclim2.5, method = 'bilinear')
plot(NR_dem_resampled_2.5)

FL_dem_resampled_2.5 <- resample(FL_dem, FL_bioclim2.5, method = 'bilinear')
plot(FL_dem_resampled_2.5)

# Stacking the rasters
NR_bioclim_dem_stack <- stack(NR_countries_bioclim2.5, NR_dem_resampled_2.5)
names(NR_bioclim_dem_stack)[20] <- "mean_elevation"

FL_bioclim_dem_stack <- stack(FL_bioclim2.5, FL_dem_resampled_2.5)
names(FL_bioclim_dem_stack)[20] <- "mean_elevation"

Importing impervious surface layer

#Importing/ cropping/ matching extents/ stacking impervious surface raster
imp_surf <- raster('gmis_impervious_surface_percentage_geographic_250m.tif')
FL_imp_surf <- crop(imp_surf, FL_bioclim_dem_stack)
FL_imp_surf_resam <- resample(FL_imp_surf, FL_bioclim_dem_stack$bio1, method ='bilinear')
plot(FL_imp_surf_resam)

FL_bioclim_dem__imp_stack <- stack(FL_bioclim_dem_stack, FL_imp_surf_resam)

NR_imp_surf <- crop(imp_surf, NR_bioclim_dem_stack)
NR_imp_surf_resam <- resample(NR_imp_surf, NR_bioclim_dem_stack$bio1, method = "bilinear")
NR_imp_surf_resam_mask <- mask(NR_imp_surf_resam, NR_bioclim_dem_stack$bio1)
plot(NR_imp_surf_resam_mask)

NR_bioclim_dem__imp_stack <- stack(NR_bioclim_dem_stack, NR_imp_surf_resam_mask)

Creating the actual bias layer

#First we need to run a function. This function is available in a package, but R will not let me open it, so I had to find the function itselt. Essentially, it just lets you to sample points from a raster based upon spatial probabilities.

sampleRast <- function(x, n, adjArea = TRUE, replace = TRUE, prob = TRUE) {
  
  val <- as.vector(x[[1]])
  
  # adjust probabilities for cell area and/or cell values
  if (adjArea) {
    
    areas <- raster::area(x[[1]], na.rm=TRUE)
    areas <- as.vector(areas)
    probs <- if (prob) {
      val * areas
    } else {
      areas
    }
    
  } else if (!adjArea & prob) {
    
    if (any(probs < 0)) warning('Some probabilities are < 0.', .immediate=TRUE)
    probs <- val
    
  } else if (!adjArea & !prob) {
    
    probs <- rep(1, length(val))
    
  }
  
  cellNum <- 1:raster::ncell(x)
  cellNum <- cellNum[!is.na(val)]
  
  probs <- probs[!is.na(val)]
  
  sites <- sample(cellNum, size=n, replace=replace, prob=probs)
  xy <- raster::xyFromCell(x, sites)
  
  xy
  
}

#Converting the dataframe to a coordinate object
coordinates(NR_squam_guild_no_fossils) <- ~ decimalLongitude + decimalLatitude
#Creating a raster of all point data
occur.ras <- rasterize(NR_squam_guild_no_fossils, NR_bioclim_dem_stack, 1)
plot(occur.ras)

#Creating a point dataframe
presences <- which(values(occur.ras) == 1)
pres.locs <- coordinates(occur.ras)[presences, ]
plot(pres.locs)

#Estimating kernel density. Here you want the limits to be set to the boundaries of your accessible area. 'h' refers to bandwidth. I think that we want a lower bandwidth, as this creates greater contrast between areas with and without sampling.
dens <- kde2d(pres.locs[,1], pres.locs[,2],
              h = rep(1,1),
              n = c(nrow(occur.ras), ncol(occur.ras)),
              lims = c(range(-84.95833, -72.75), range(19.25, 26.91667)))
dens.ras <- raster(dens)

# The resample command will change both the extent AND the resolution to match a desired file. Here, I want to make sure that the heatmap resolution matches the resolution of our bioclim raster
dens.ras_resam <- resample(dens.ras, NR_bioclim_dem_stack, method='bilinear')

#Now I want to make sure the projections match, and mask all values outside our area of interest
crs(dens.ras_resam) = crs(NR_bioclim_dem_stack$bio1)
dens.ras_resam_mask <- mask(dens.ras_resam, NR_bioclim_dem_stack$bio1)
plot(dens.ras_resam_mask)

#Now we have to put the above map in point form so that it can be run in the Maxent command

#I am using the function we created earlier. Essentially, we are selecting samples from our accessible area, only weighted by the density of Anuran occurrence points (from our heatmap). I am using 300 here, but when we run all this in higher resolution, wil will increase the number.
background_T<-sampleRast(dens.ras_resam_mask,300,adjArea=TRUE, replace=FALSE,prob=TRUE)
nrow(dens.ras_resam_mask)
## [1] 184
plot(dens.ras_resam_mask)
points(background_T)
plot(le_ca_shapefile, add = TRUE)

Removing presence points with NA predictor variable values

Hopefully we will be able to keep these later when we crank up the resolution. However, we want to make sure when we split our data into train/test data that we do not have lots of NA values
le_ca_pres <- data.frame(le_ca_reduced_no_NAs_IUCN_native_range$decimalLongitude, 
                         le_ca_reduced_no_NAs_IUCN_native_range$decimalLatitude)
colnames(le_ca_pres) <- c('lon', 'lat')

#First we find all values in our occurrence data with NA preidictor values
find_na <- raster::extract(NR_bioclim_dem__imp_stack, le_ca_pres)

#Binding that to the original dataframe to get locations
le_ca_pres_na <- cbind(le_ca_pres, find_na)

#You can see we lost about half of all those points (not a big deal given we still have more than enough)
dim(le_ca_pres_na)
## [1] 364  23
dim(na.omit(le_ca_pres_na))
## [1] 198  23
#Removing the rows with NA values, and then removing all predictor variables 
le_ca_pres_no_na_preds <- na.omit(le_ca_pres_na)
le_ca_pres_no_na_preds <- data.frame(le_ca_pres_no_na_preds$lon, le_ca_pres_no_na_preds$lat)
colnames(le_ca_pres_no_na_preds) <- c('lon', 'lat')
# write.csv(le_ca_pres_no_na_preds, 'data/le_ca_pres_na.csv')

Generating training/ test data

## Generate random number to separate training and testing occurrence data (70% training, 30% test)
i = sample(seq(1,nrow(le_ca_pres_no_na_preds)), 
           nrow(le_ca_pres_no_na_preds)* 0.30, replace = FALSE)

le_ca_pres_no_na_preds[, "scientificName"] <- "L_carinatus"

## training and testing data separated, only 3 
test_occr = le_ca_pres_no_na_preds[i, c("scientificName", "lon", "lat")]
train_occr = le_ca_pres_no_na_preds[-i, c("scientificName", "lon", "lat")]

dim(test_occr)
## [1] 59  3
dim(train_occr)
## [1] 139   3
# Saving these data so that we can access it later
# write.csv(test_occr, "data/test_L_carinatus.csv", row.names =FALSE)
# write.csv(train_occr, "data/train_L_carinatus.csv", row.names =FALSE)

## Get presences data but only latitiude and longitude. 
occur <- train_occr[,-1]
names(occur)<-c("lon","lat")

FInally, some models!

Here we will be using package dismo

#Subsetting that raster based on you a priori approach
NR_bioclim_dem__imp_stack_reduced<- stack(NR_bioclim_dem__imp_stack[[c(2,5,6,7,10,11,15,20,21),]])
FL_bioclim_dem__imp_stack_reduced <- stack(FL_bioclim_dem__imp_stack[[c(2,5,6,7,10,11,15,20,21),]])

#Matching all projections
crs(NR_bioclim_dem__imp_stack_reduced)="+proj=longlat +datum=WGS84"
crs(FL_bioclim_dem__imp_stack_reduced) ="+proj=longlat +datum=WGS84"

## Maxent arguments for dismo is listed here. https://urldefense.proofpoint.com/v2/url?u=https-3A__groups.google.com_forum_-23-21topic_maxent_yRBlvZ1-5F9rQ&d=DwIGaQ&c=sJ6xIWYx-zLMB3EPkvcnVg&r=nNnY6X5WY58RgeQXOKDQ61FWZnWJi1m_cldgGu59VxM&m=skM0YtecCzIghUzqVpUqwFdpGX3fReLIMh72UsSFje8&s=Ug_bbiC0ZCyJyEIlwiEMNQh8aW-vBSsd6uTAqY755Qw&e= . 
## only linear and regularization is 1 
arguments <- c("-J", "-P", "-q", "-p", "-h", "randomtestpoints=30", 
               "replicates=5", "betamultiplier=1", "askoverwrite=false", 
               "threads=6")

mod_lin1 <- maxent(x=NR_bioclim_dem__imp_stack_reduced, p=coordinates(occur), a = background_T, args=arguments) 
## Loading required namespace: rJava
#Running the code below will generate an html site that gives the results of the Maxent model
# mod_lin1 

#Using the model to predict over the native range (using the mean function to create a composite of all 5 replicates)
NR_Pred1 = predict(mod_lin1, NR_bioclim_dem__imp_stack_reduced)
plot(mean(NR_Pred1))

#Finding the threshold to set occurrence
mean_nr_pred <- mean(NR_Pred1)

pred_at_occ<- raster::extract(mean_nr_pred, coordinates(occur)) # Here we are extracting those relative (0-1) suitability values at each of our occurrence points
quantile(pred_at_occ, 0.05) #Now we are determining the 0.05 quantile, as we will set the presence/ absence threshold to that
##        5% 
## 0.3610937
#Predicting occurrence 
FL_predict1 = predict(mod_lin1,FL_bioclim_dem__imp_stack_reduced)
plot(mean(FL_predict1))

mean_fl_pred <- mean(FL_predict1)

#Setting the presence/ absence threshold to the 0.05 quantile calulated above
mean_fl_pred[mean_fl_pred <= quantile(pred_at_occ, 0.05) ] = 0
mean_fl_pred[mean_fl_pred >quantile(pred_at_occ, 0.05) ] = 1
plot(mean_fl_pred)

####As you can see, there is apparently very little area predicted as suitable in FL. However,I think a lot of this is driven the the %impervious surface raster. So, for the second model, we will remove that. We can talk more about it, but I have a feeling that not having very much impervious surface is penalizing predicted suitability too severely

MODEL 2

#Subsetting that raster based on you a priori approach
NR_bioclim_no_imp_reduced<- stack(NR_bioclim_dem__imp_stack[[c(2,5,6,7,10,11,15,20),]])
FL_bioclim_no_imp_reduced <- stack(FL_bioclim_dem__imp_stack[[c(2,5,6,7,10,11,15,20),]])

#Matching all projections
crs(NR_bioclim_no_imp_reduced)="+proj=longlat +datum=WGS84"
crs(FL_bioclim_no_imp_reduced) ="+proj=longlat +datum=WGS84"

## only linear and regularization is 1 
arguments <- c("-J", "-P", "-q", "-p", "-h", "randomtestpoints=30", 
               "replicates=5", "betamultiplier=1", "askoverwrite=false", 
               "threads=6")

mod_lin2 <- maxent(x=NR_bioclim_no_imp_reduced, p=coordinates(occur), a = background_T, args=arguments) 
#Running the code below will generate an html site that gives the results of the Maxent model
# mod_lin2 

#Using the model to predict over the native range (using the mean function to create a composite of all 5 replicates)
NR_Pred2 = predict(mod_lin2, NR_bioclim_no_imp_reduced)
plot(mean(NR_Pred2))

#Finding the threshold to set occurrence
mean_nr_pred2 <- mean(NR_Pred2)

pred_at_occ2<- raster::extract(mean_nr_pred2, coordinates(occur)) # Here we are extracting those relative (0-1) suitability values at each of our occurrence points
quantile(pred_at_occ2, 0.05) #Now we are determining the 0.05 quantile, as we will set the presence/ absence threshold to that
##        5% 
## 0.4027691
#Predicting occurrence 
FL_predict2 = predict(mod_lin2,FL_bioclim_no_imp_reduced)
plot(mean(FL_predict2))

mean_fl_pred2 <- mean(FL_predict2)

#Setting the presence/ absence threshold to the 0.05 quantile calulated above
mean_fl_pred2[mean_fl_pred2 <= quantile(pred_at_occ, 0.05) ] = 0
mean_fl_pred2[mean_fl_pred2 >quantile(pred_at_occ, 0.05) ] = 1
plot(mean_fl_pred2)

Model 3: with some precipitation

For this model, I removed two variables with no explanatory power, and then added a couple of precipitation variables (Precip in wettest quarter and Precip in warmest quarter)
NR_bioclim_precip_reduced<- stack(NR_bioclim_dem__imp_stack[[c(2,5,7,11,13,15,18,20),]])
FL_bioclim_precip_reduced <- stack(FL_bioclim_dem__imp_stack[[c(2,5,7,11,13,15,18,20),]])

#Matching all projections
crs(NR_bioclim_precip_reduced)="+proj=longlat +datum=WGS84"
crs(FL_bioclim_precip_reduced) ="+proj=longlat +datum=WGS84"

## only linear and regularization is 1 
arguments <- c("-J", "-P", "-q", "-p", "-h", "randomtestpoints=30", 
               "replicates=5", "betamultiplier=1", "askoverwrite=false", 
               "threads=6")

mod_lin3 <- maxent(x=NR_bioclim_precip_reduced, p=coordinates(occur), a = background_T, args=arguments) 
#Running the code below will generate an html site that gives the results of the Maxent model
# mod_lin3 

#Using the model to predict over the native range (using the mean function to create a composite of all 5 replicates)
NR_Pred3 = predict(mod_lin3, NR_bioclim_precip_reduced)
plot(mean(NR_Pred3))

#Finding the threshold to set occurrence
mean_nr_pred3 <- mean(NR_Pred3)

pred_at_occ3<- raster::extract(mean_nr_pred3, coordinates(occur)) # Here we are extracting those relative (0-1) suitability values at each of our occurrence points
quantile(pred_at_occ3, 0.05) #Now we are determining the 0.05 quantile, as we will set the presence/ absence threshold to that
##        5% 
## 0.3629274
#Predicting occurrence 
FL_predict3 = predict(mod_lin3,FL_bioclim_precip_reduced)
plot(mean(FL_predict3))

mean_fl_pred3 <- mean(FL_predict3)

#Setting the presence/ absence threshold to the 0.05 quantile calulated above
mean_fl_pred3[mean_fl_pred3 <= quantile(pred_at_occ, 0.05) ] = 0
mean_fl_pred3[mean_fl_pred3 >quantile(pred_at_occ, 0.05) ] = 1
plot(mean_fl_pred3)

#I think part of the issue here is the loss of Bio6 (min temp of coldest month). While this variable (as well as mean temp of coldest quarter) are not super important in the models so far, this is largely because there is not that much variation in these variables in the native range. But, Obviously we know this is going to be a drastic limiting factor based upon the CT data alone.

Model evaluation

coming soon!