Practical 4 Species Distribution Modelling I

setwd("P:/GEOG70922/Week5")




install.packages(c("dismo","maptools","glmnet","maxnet","raster","sp"))



library(dismo)

files <- list.files(path=paste(system.file(package="dismo"),
                               '/ex', sep=''), pattern='grd', full.names=TRUE )

files




env<-stack(files[1:8])


#use the names() function to label each covariate layer

names(env) <- c("MeanTemp.","AnnualPrecip.","Precip.WettestQuarter","PrecipDriestQuarter","Max.Temp.","Min.Temp.","Temp.Range","MeanTemp.WettestQuarter")



plot(env)

library(maptools)
 data(wrld_simpl)

#And now plot:
 # first layer of the RasterStack
 plot(env, 1)
 # note the "add=TRUE" argument with plot
 plot(wrld_simpl,add=TRUE)




bradypus <- gbif("Bradypus", "variegatus*",sp=TRUE)



wgs84<-crs.latlong<-crs("+init=epsg:4326")


crs(bradypus)


plot(env, 1)
 
plot(wrld_simpl,add=TRUE)

plot(bradypus,add=TRUE,col='red')



View(wrld_simpl@data)



SA<-subset(wrld_simpl,wrld_simpl$SUBREGION==5|wrld_simpl$SUBREGION==13)#subset the world map to South and Central America

plot(SA)


#Set the bradypus crs to that of the SA object created above.
crs(bradypus)<-crs(SA)

#subset our points to only those inside SA
bradypus<-bradypus[SA, ]

plot(SA)

plot(bradypus, add=TRUE)





set.seed(11)

#create random points on cells of the 'env' object within the extent of bradypus and avoiding cells containing points from bradypus

back.xy <- randomPoints(env, n=1000,p= bradypus,ext = extent(bradypus))


#Create Spatial Points layer from back.xy

back<-SpatialPoints(back.xy,crs(bradypus))

#make sure crs matches other data

crs(back)<-crs(SA)


#subset to SA object

back<-back[SA,]

plot(SA)

plot(bradypus,add=TRUE)

plot(back,add=TRUE, col='red')




#absence values

eA<-extract(env,back)

#presence values
eP<-extract(env,bradypus)

#create data frames from values
Pres.cov<-data.frame(eP,Pres=1)


Back.cov<-data.frame(eA,Pres=0)


#combine both data sets using the rbind() function (binds datsets with same columns by stacking one on top of the other - i.e. joining rows together)
all.cov<-rbind(Pres.cov,Back.cov)



head(all.cov)
tail(all.cov)
summary(all.cov)




bc_bradypus <- bioclim(Pres.cov[,c('MeanTemp.', 'Temp.Range', 'AnnualPrecip.')])


bc_bradypus  





bioclim.map <- predict(env, bc_bradypus)
plot(bioclim.map, axes = F, box = F, main = "bioclim: MeanTemp.,Temp.Range,AnnualPrecip.")




bc_bradypus2<-bioclim(Pres.cov[,1:8])


bioclim.map2 <- predict(env, bc_bradypus2)
plot(bioclim.map2, axes = F, box = F, main = "bioclimAll")


response(bc_bradypus2)


#Specify model
glm.bradypus<-glm(Pres~.,binomial(link='logit'), data=all.cov)


#get summary of outputs - regression coefficients are log(odds) where positive values denote a positive relationship between the predictor variable and the liklihood of species presence (i.e. the liklihood that Pres =1).

summary(glm.bradypus)






#Use the predict function to estimate probabilities for our study area based on the raster stack object 'env'. Note the "response" argument tells the predict function to return values as probabilities, i.e. from 0 to 1).  
glm.map <- predict(env, glm.bradypus, type = "response")

#plot the map
plot(glm.map,main="glmBradypus")




 library(maxnet)
 library(glmnet)
 maxnet.ppm <- maxnet(all.cov$Pres, all.cov[,1:8],maxnet.formula(all.cov$Pres,all.cov[,1:8],classes="lq")) # runs a presence-only Maxent model selecting feature "linear" and "quadratic" using the "lq" argument.  


 


maxnet.cloglog.map <- predict(env, maxnet.ppm, clamp=F, type="cloglog") # make predictions to map and transform values with the clog-log argument (similar to logistic transform) to get range 0 to 1.

plot(maxnet.cloglog.map, axes=F, box=F, main="Maxnet-clog")



#For evaluating the first bioclim model we only povide the MeanTemp.,Temp.Range and AnnualPrecip. predictors which are columns 1,2 and 7 in the Pres.cov data frame



############### model validation

eBC <- evaluate(Pres.cov[,c(1,2,7)],Back.cov,bc_bradypus)

#For model two validation, provide all predictors
eBC2 <- evaluate(Pres.cov,Back.cov,bc_bradypus2)

#inspect both evaluation reults

eBC

eBC2




 plot(eBC,'ROC') #plot the receiver operatoring characteristic curves

 plot(eBC2,'ROC')



#GLM evaluation

#Model with all predictors
eglmAll <- evaluate(all.cov[all.cov$Pres==1,],all.cov[all.cov$Pres==0,], glm.bradypus)


#inspect
eglmAll
plot(eglmAll,'ROC')



#Repeat for Maxnet

 maxeval<-evaluate(p=bradypus, a=back, maxnet.ppm, env)

 maxeval

 plot(maxeval,'ROC')



#Kfold validation approach 
 
set.seed(5) #ensure random selection of cases in each fold is replicable

#set number of folds to use
folds=5

#partiction presence and absence data according to folds using the kfold() function.

kfold_pres <- kfold(Pres.cov, folds)
kfold_back <- kfold(Back.cov, folds)


#create an empty list to hold our results (remember there will be five sets)
bc_K<-list()
par(mfrow=c(2,3))
# for loop to iterate over folds
for (i in 1:folds) {
  train <- Pres.cov[kfold_pres!= i,]
  test <- Pres.cov[kfold_pres == i,]
  backTrain<-Back.cov[kfold_back!=i,]
  backTest<-Back.cov[kfold_back==i,]
  dataTrain<-rbind(train,backTrain)
  dataTest<-rbind(test,backTest)
  bc_bradypus_K<-bioclim(train[,1:8])#for bioclimm we only need presence data to train the model!
  bc_K[[i]] <- evaluate(p=test[,1:8],a=backTest[,1:8], bc_bradypus_K)#use testing data (kfold==i) for model evaluation
  
  #check the AUC by plotting ROC values
  
  plot(bc_K[[i]],'ROC')
  
}


#inspect

bc_K


#get the AUC from the evaluation reults

aucBIO <- sapply( bc_K, function(x){slot(x, 'auc')} )


#calculate the mean AUC value for comparison with other models


mean(aucBIO)

Opt_BIO<-sapply(bc_K, function(x){ x@t[which.max(x@TPR + x@TNR)] } )

Opt_BIO





#take the mean of the OptBIO list for application in our predictions

Mean_OptBIO<- mean(Opt_BIO)



Mean_OptBIO



prBIO <- predict(env, bc_bradypus_K,type = "response")
par(mfrow=c(1,2))
plot(prBIO, main='BIO_K')
plot(wrld_simpl, add=TRUE, border='dark grey')
plot(prBIO > Mean_OptBIO, main='presence/absence')
plot(wrld_simpl,add=TRUE, border ='dark grey')





#create an empty list to hold our results (remember there will be five sets)
eGLM<-list()
par(mfrow=c(2,3))

for (i in 1:folds) {
  train <- Pres.cov[kfold_pres!= i,]
  test <- Pres.cov[kfold_pres == i,]
  backTrain<-Back.cov[kfold_back!=i,]
  backTest<-Back.cov[kfold_back==i,]
  dataTrain<-rbind(train,backTrain)
  dataTest<-rbind(test,backTest)
  glm_eval <- glm(Pres~.,binomial(link = "logit"), data=dataTrain)#this is our glm model trained on presence and absence points
  eGLM[[i]] <- evaluate(p=dataTest[ which(dataTest$Pres==1),],a=dataTest[which(dataTest$Pres==0),], glm_eval)#use testing data (kfold==i) for model evaluation
  
  #check the AUC by plotting ROC values
  
  plot(eGLM[[i]],'ROC')
  
}


#inspect

eGLM


aucGLM <- sapply( eGLM, function(x){slot(x, 'auc')} )

#calculate the mean values for comparison with other models


mean(aucGLM)


Opt_GLM<-sapply( eGLM, function(x){ x@t[which.max(x@TPR + x@TNR)] } )

Opt_GLM



#take the mean to be applied to our predictions

Mean_OptGLM<- mean(Opt_GLM)

trGLM<-plogis(Mean_OptGLM)

trGLM


prGLM <- predict(env, glm_eval,type = "response")
par(mfrow=c(1,2))
plot(prGLM, main='GLM, regression')
plot(wrld_simpl, add=TRUE, border='dark grey')
plot(prGLM > trGLM, main='presence/absence')
plot(wrld_simpl,add=TRUE, border ='dark grey')



###################################IPP data prep



# select a large number of background points using the sampleRegular() function (note that with this function we have the option of returning an spatial points option using 'sp=TRUE')

BG<-sampleRegular(env,20000,xy=FALSE,sp=TRUE)



BG<-spTransform(BG,crs(bradypus)) #align CRS


backPPM<-BG[SA,] #subset to study area



#create Pres variable and assign '0'
backPPM$Pres=0



back.covPPM<-data.frame(backPPM)


back.covPPM<-back.covPPM[,1:9] # remove unwanted columns





all.covPPM<-rbind(Pres.cov,back.covPPM) # rbind for new data frame

head(all.cov)


# specify the gim model assigned very large weights to background points
glm.ppm <- glm(Pres ~. , family = binomial(link=logit), weights = 10^(1-Pres),
               data = all.covPPM)

summary(glm.ppm) # summary of regression





eGLMppm<-list()
par(mfrow=c(2,3))

for (i in 1:folds) {
  train <- Pres.cov[kfold_pres!= i,]
  test <- Pres.cov[kfold_pres == i,]
  backTrain<-back.covPPM[kfold_back!=i,]
  backTest<-back.covPPM[kfold_back==i,]
  dataTrain<-rbind(train,backTrain)
  dataTest<-rbind(test,backTest)
  glm_evalPPM <- glm(Pres ~. , family = binomial(link=logit), weights = 10^(1-Pres), data=dataTrain) # weight background points to approximate a point proces model 
  eGLMppm[[i]] <- evaluate(p=dataTest[ which(dataTest$Pres==1),],a=dataTest[which(dataTest$Pres==0),], glm_evalPPM)
  plot(eGLMppm[[i]],'ROC')
  
}

aucGLMppm <- sapply( eGLMppm, function(x){slot(x, 'auc')} )
aucGLMppm
mean(aucGLMppm)

TNR_GLMppm<-sapply( eGLMppm, function(x){ x@t[which.max(x@TPR + x@TNR)] } )

TNR_GLMppm

Mean_TNR_GLMppm<- mean(TNR_GLMppm)

Mean_TNR_GLMppm



#the glmPPM model seems to work well only inside the extent of of the bradypus point data set so clip the output to this extent:
bradExtent<-extent(bradypus)

prPPM <- predict(env, glm_evalPPM,type = "link",ext=bradExtent)
par(mfrow=c(1,2))
plot(prPPM, main='GLMppm, regression')
plot(wrld_simpl, add=TRUE, border='dark grey')
tr <- Mean_TNR_GLMppm
plot(prPPM > tr, main='presence/absence')
plot(wrld_simpl, add=TRUE, border='dark grey')








#We can perform the same evaluation process on our maxent model also (we will leave k-fold validation of the #lesser-performing bioclim model).




par(mfrow=c(2,3))

eMAX<-list()

folds=5

kfold_pres <- kfold(Pres.cov, folds)
kfold_back <- kfold(Back.cov, folds)


for (i in 1:folds) {
  train <- Pres.cov[kfold_pres!= i,]
  test <- Pres.cov[kfold_pres == i,]
  backTrain<-Back.cov[kfold_back!=i,]
  backTest<-Back.cov[kfold_back==i,]
  dataTrain<-rbind(train,backTrain)
  dataTest<-rbind(test,backTest)
  maxnet_eval <- maxnet(dataTrain$Pres, dataTrain[,1:8],regmult=3)
  eMAX[[i]] <- evaluate(p=dataTest[which(dataTest$Pres==1),],a=dataTest[which(dataTest$Pres==0),], maxnet_eval)
  plot(eMAX[[i]],'ROC')
  
}



#Again, we can access our AUC and MaxTPR+TNR statistics for this latest model (for reference, the equivalent AUC #evaluation for the same analysis using the stand-alone Maxent package was 0.901).




aucMAX <- sapply( eMAX, function(x){slot(x, 'auc')} )

#calculate the mean values for coparison with other models

aucMAX

mean(aucMAX)


#Get maxTPR+TNR for the maxnet model

Opt_MAX<-sapply( eMAX, function(x){ x@t[which.max(x@TPR + x@TNR)] } )

Opt_MAX

Mean_OptMAX<-mean(Opt_MAX)

Mean_OptMAX

# The threshold values for the maxnet model relate to the raw outputs (prior to transformation). We can acheive our #presence/absence map simply by applying our threshold to these raw values. To do so, we map the raw results of the #model (i.e, this time we do not opt for a 'cloglog' or other transform) and then remove all values below our #threshold.




prMAX <- predict(env, maxnet_eval)
par(mfrow=c(1,2))
plot(prMAX, main='Maxent Prediction')
plot(wrld_simpl, add=TRUE, border='dark grey')
plot(prMAX > Mean_OptMAX, main='presence/absence')
plot(wrld_simpl,add=TRUE, border ='dark grey')
Practical 4 Species Distribution Modelling I

MD

20/02/2020