#Set working directory
setwd("~/Documents/University_of_Minnesota/Nielsen_Lab/github/ecological-niche-modeling/")

#Set random seed
set.seed(14114)

#Load in the data
crir<-read.csv("data/Blasto_Summary2.csv",as.is = TRUE, na.strings = c( "", " "))
colnames(crir)
##  [1] "Sample.ID"             "Blasto.Present..Y.N." 
##  [3] "Present.is.1"          "Histo.Present..Y.N."  
##  [5] "Histo.Present.is.1"    "Region"               
##  [7] "Date"                  "Time"                 
##  [9] "Waypoint.ID"           "Latitude"             
## [11] "Longitude"             "Elevation..m."        
## [13] "Site.Classification"   "Groundcover"          
## [15] "Canopy.Classification" "Distance.to.Water"    
## [17] "Water.Classification"  "Landform"             
## [19] "Nearby.Fecal.Matter"   "Rotting.Wood.Present" 
## [21] "Sample.Type"           "Soil.pH"              
## [23] "Sample.Notes"          "Location.Notes"
dim(crir) #rows x columns
## [1] 90 24
#Avg detection of Blastomyces
b.avg.detection <- sum(crir$Blasto.Present..Y.N. == "Y")/length(crir$Blasto.Present..Y.N.)
b.avg.detection #52.22%
## [1] 0.5222222
#Avg detection of Histoplasma
h.avg.detection <- sum(crir$Histo.Present..Y.N. == "Y")/length(crir$Histo.Present..Y.N.)
h.avg.detection #46.67%
## [1] 0.4666667
colnames(crir)[2]<-"BAD1_DNA_present"

#Read in White Wilderness Data
wwt<-read.csv("data/WWT_Summary.csv",as.is=TRUE,na.strings=c(""," ")) #No Blasto data
dim(wwt)
## [1] 52 20
wwt$BAD1_DNA_present<-rep("ND",nrow(wwt))

arr<-read.csv("data/Arrowhead_Summary.csv",na.strings=c(""," "))
dim(arr)
## [1] 168  26
#Extract longitude/latitude coordinates
fix_coords<-function(x){
  x$Latitude<-as.numeric(gsub("N","",x$Latitude))
  x$Longitude<-as.numeric(gsub("W","",x$Longitude))*(-1)
  return(x)
}

crir<-fix_coords(crir)
wwt<-fix_coords(wwt)
arr<-fix_coords(arr)

#Extract coordinates only for mapping purposes
coord_subset<-function(x){
  x<-x[,colnames(x) %in% c("BAD1_DNA_present","Latitude","Longitude")]
  return(x)
}

crir.sub<-coord_subset(crir)
wwt.sub<-coord_subset(wwt)
arr.sub<-coord_subset(arr)
arr.sub$BAD1_DNA_present<-gsub("NA","ND",arr.sub$BAD1_DNA_present)

#Merge datasets
mapdat<-rbind(crir.sub,wwt.sub,arr.sub)
meanlong<-mean(mapdat$Longitude)
meanlong #mean longitude
## [1] -92.08908
meanlat<-mean(mapdat$Latitude)
meanlat #mean latitude
## [1] 47.53175
#Draw map of sampling locations
mp <- get_map(location = c(long=meanlong-0.15,lat=meanlat), maptype = "terrain", source = "google", zoom = 8,filename = "minnesota")
## Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=47.531751,-92.239084&zoom=8&size=640x640&scale=2&maptype=terrain&language=en-EN&sensor=false
mapdat$BAD1_colors<-mapdat$BAD1_DNA_present
mapdat$BAD1_colors<-gsub("ND","black",mapdat$BAD1_colors)
mapdat$BAD1_colors<-gsub("N","blue",mapdat$BAD1_colors)
mapdat$BAD1_colors<-gsub("Y","red",mapdat$BAD1_colors)


mplot<-ggmap(mp) + 
  geom_point(data = mapdat, aes(x=Longitude,y=Latitude,color=BAD1_colors))+
  scale_colour_manual(name = 'Blastomyces present',values=levels(factor(mapdat$BAD1_colors)),
                      labels=c("ND","N","Y"))

mplot

#Find and fix the outlier in Lake Superior
arr[arr$BAD1_DNA_present=="ND" & arr$Latitude < 47.1 & arr$Longitude > -91.4,]
##    Sample.ID Random.Non.random   Date Latitude Longitude Elevation..ft.
## 41   A16-21R                 R 6/6/16 47.03755 -91.38154            867
## 42  A16-21NR                NR 6/6/16 47.03755 -91.38154            867
##    Site.classification      Groundcover    Canopy Distance.to.water..m.
## 41         residential maintained grass no canopy      no visible water
## 42         residential maintained grass no canopy      no visible water
##    Water.Classification Landform Fecal.Matter Rotting.wood.summary
## 41                  n/a     flat         none                    Y
## 42                  n/a     flat         none                    Y
##    Rotting.Wood..Logs. Rotting.Wood..Branches.
## 41                   N                       Y
## 42                   N                       Y
##    Rotting.Wood..twigs.small.debris. Rotting.Wood..Processed.wood.
## 41                                 Y                             N
## 42                                 Y                             N
##          Sample.Type   pH Sample.ID.1 DNA_extraction_attempted
## 41              soil 6.80     A16-21R                        Y
## 42 vegetative debris 6.68    A16-21NR                        Y
##    DNA_concentration Limit_of_detection..ng.ml. BAD1_DNA_present
## 41             143.1                         ND               ND
## 42              20.5                         ND               ND
##    X...comments
## 41         <NA>
## 42         <NA>
mapdat.n<-mapdat[mapdat$BAD1_DNA_present=="ND" & mapdat$Latitude < 47.1 & mapdat$Longitude > -91.4,]
mapdat.tr<-mapdat[-(as.numeric(rownames(mapdat.n))),]

mplot2<-ggmap(mp) + 
  geom_point(data = mapdat.tr, aes(x=Longitude,y=Latitude,color=BAD1_colors))+
  scale_colour_manual(name = 'Blastomyces present',values=levels(factor(mapdat.tr$BAD1_colors)),
                      labels=c("ND","N","Y"))

mplot2

#Subset data to include variables of interest
#Site.classification vs. "Site.Classification"
#"Rotting.wood.summary" vs. Rotting.Wood.Present
#"Elevation..ft." vs. "Elevation..m."
#"Canopy" vs. Canopy.Classification
#Soil.pH vs. pH
#Distance.to.water..m. vs. Distance.to.Water
#Fecal.Matter vs. Nearby.Fecal.Matter
fix_colnames<-function(x){
  colnames(x)[which(colnames(x)=="Rotting.wood.summary")] <- "Rotting.Wood.Present"
  colnames(x)[which(colnames(x)=="Site.classification")] <-"Site.Classification"
  colnames(x)[which(colnames(x)=="Canopy")] <-"Canopy.Classification"
  colnames(x)[which(colnames(x)=="Distance.to.water..m.")]<-"Distance.to.Water"
  colnames(x)[which(colnames(x)=="Elevation..ft.")]<-"Elevation..m."
  colnames(x)[which(colnames(x)=="Fecal.Matter")]<-"Nearby.Fecal.Matter"
  colnames(x)[which(colnames(x)=="pH")]<-"Soil.pH"
  return(x)
}  

#fix_colnames<-function(x){
  #colnames(x)[colnames(x) %in% c("Rotting.wood.summary","pH","Site.classification","Canopy",
                     #"Distance.to.water..m.","Elevation..ft.",
                     #"Fecal.Matter")] <-c("Elevation..m.","Site.Classification",
                                         #"Canopy.Classification","Distance.to.Water",
                                         #"Nearby.Fecal.Matter","Rotting.Wood.Present","Soil.pH")
  #return(x)                                         
#}

##Fix column names to match the CR/IR dataset
arr.c<-fix_colnames(arr)
arr.c$Region<-"ARR" #Add the arrowhead region classification
wwt.c<-fix_colnames(wwt)
wwt.c$Region<-"WWT" #Add the white wilderness sampling classification


##Need to adjust elevation from ft to meters (multiply feet by 0.3048)
fix_elevation<-function(x){
  x$Elevation..m.<-round((x$Elevation..m. * 0.3048),2)
  return(x)
}
arr.f<-fix_elevation(arr.c)
wwt.f<-fix_elevation(wwt.c)

#Subset variables for predictors we are interested in
var_subset<-function(x){
  x<-x[,colnames(x) %in% c("BAD1_DNA_present","Latitude","Longitude",
                           "Site.Classification","Groundcover","Canopy.Classification",
                           "Elevation..m.","Distance.to.Water","Water.Classification",
                           "Landform","Nearby.Fecal.Matter","Rotting.Wood.Present",
                           "Sample.Type","Soil.pH")]
  return(x)
}

crir.v<-var_subset(crir)
wwt.v<-var_subset(wwt.f)
arr.v<-var_subset(arr.f)

#Merge the data
mergdat<-rbind(crir.v,wwt.v,arr.v) #all data
dim(mergdat)
## [1] 310  14
#310 observations of 14 variables

#Print uncleaned data
for(i in 6:ncol(mergdat)-1){
  cat(paste0("=========",colnames(mergdat)[i],"========="))
  print(table(mergdat[,i]))
  cat("\n")
}
## =========Site.Classification=========
##   Bog/Marsh   Creekside      forest      Forest   grassland   Grassland 
##          14           2         116          55          30          14 
## residential Residential    Roadside  Shoreline        urban     wetland 
##           8           4           1           6           4           2 
##    wetland  
##          54 
## 
## =========Groundcover=========
##         bare soil         Bare Soil            forest            Forest 
##                33                 2                74                26 
##         Grassland            Gravel  maintained grass  Maintained Grass 
##                15                 1                 8                 1 
##              Moss natural grassland              Sand vegetative debris 
##                10                36                 2                 7 
## Vegetative Debris           wetland         Woodchips 
##                32                62                 1 
## 
## =========Canopy.Classification=========
##              6           Full    full canopy             No      no canopy 
##              2              8              2             24            111 
##        Partial partial canopy 
##             58            105 
## 
## =========Distance.to.Water=========
##               <1                0                1               10 
##               66               30               38               12 
##                2                3                4                5 
##                2               12                2               15 
##                6 no visible water                x 
##                2               80               51 
## 
## =========Water.Classification=========
##    Bog/Marsh        Creek         lake         Lake        lake  
##            8            2           10            7            2 
## large puddle Large Puddle          n/a         pond         Pond 
##           24            6           80           12            5 
##        river        River small puddle Small Puddle       stream 
##           26            6            4            7           14 
##      wetland            x 
##           48           49 
## 
## =========Landform=========
##        cliff        Creek       creek          flat gentle slope 
##            4            2            4          122           36 
## Gentle Slope      hilltop          Pit    shoreline    Shoreline 
##           67            2            2           42           14 
##  steep slope  Steep Slope      valley  
##            4            1            6 
## 
## =========Nearby.Fecal.Matter=========
##  dropping droppings Droppings      none         x 
##         2        10         5       208         1 
## 
## =========Rotting.Wood.Present=========
##                  N                 N                  No 
##                 76                  1                 23 
## Twigs/Small Debris                  Y                Yes 
##                 11                143                 56 
## 
## =========Sample.Type=========
##          Ash/Soil      Fecal Matter              moss       mulch chips 
##                 1                 2                 1                 1 
##         peat moss         Peat Moss      Plant Debris      Rotting Wood 
##                 1                 4                 3                10 
##     rotting wood               Sand           seaweed              soil 
##                54                 2                 1               123 
##              Soil             soil  vegetative debris 
##                68                11                28
#Clean the data for predictive modeling
source("src/clean_data.r")
cleandat<-clean_data(mergdat)

#Print cleaned data
for(i in 6:ncol(mergdat)-1){
  cat(paste0("=========",colnames(cleandat)[i],"========="))
  print(table(cleandat[,i]))
  cat("\n")
}
## =========Site.Classification=========
##      forest   grassland residential   shoreline     wetland 
##         171          44          17           6          72 
## 
## =========Groundcover=========
##            forest         grassland              moss  soil/sand/gravel 
##               100                60                10                38 
## vegetative debris           wetland 
##                40                62 
## 
## =========Canopy.Classification=========
##    full      no partial 
##      10     137     163 
## 
## =========Distance.to.Water=========
##   <1    0 1-10    x 
##   66   30   83  131 
## 
## =========Water.Classification=========
##    lake    pond  puddle   river  stream wetland       x 
##      19      17      41      34      14      56     129 
## 
## =========Landform=========
##      cliff      creek       flat  shoreline      slope valley/pit 
##          4          6        122         56        110          8 
##          x 
##          4 
## 
## =========Nearby.Fecal.Matter=========
## droppings      none 
##        17       293 
## 
## =========Rotting.Wood.Present=========
##   N   Y 
## 100 210 
## 
## =========Sample.Type=========
##      fecal matter         peat moss      rotting wood         soil/sand 
##                 2                 6                64               205 
## vegetative debris 
##                33
#Get data for which we have points
compdat<-cleandat[cleandat$BAD1_DNA_present=="Y" | cleandat$BAD1_DNA_present=="N",]

#Define the response variable as a boolean (1 for "Y", 0 for "N")
compdat$BAD1_DNA_present<-ifelse(test=compdat$BAD1_DNA_present=="Y",1,0)
compdat$BAD1_DNA_present
##   [1] 1 1 0 0 1 1 0 0 0 1 1 1 1 1 0 1 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 0 0
##  [36] 1 1 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 1 0 0 1 0 0 0 1 1
##  [71] 1 1 1 1 1 0 1 1 1 1 0 0 0 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 1 1 0 0 0 1 0
## [106] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 1 0 1 1 1 0 0 1 0 1
###Continuous variables

#Elevation
hist(compdat$Elevation..m.) #negative skew

summary(compdat$Elevation..m.)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   3.892   6.038   6.082   6.004   6.163   6.205
#pH
hist(compdat$Soil.pH) #looks relatively normal

summary(compdat$Soil.pH)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   3.040   5.210   5.705   5.636   6.200   7.880
corfit<-lm(compdat$Soil.pH ~ compdat$Elevation..m.)
plot(compdat$Soil.pH ~ compdat$Elevation..m.) #no correlation

summary(corfit) #p=0.875, no correlation   
## 
## Call:
## lm(formula = compdat$Soil.pH ~ compdat$Elevation..m.)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.59780 -0.41994  0.06647  0.56337  2.24480 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            5.47483    1.02170   5.359 3.49e-07 ***
## compdat$Elevation..m.  0.02678    0.16974   0.158    0.875    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8461 on 136 degrees of freedom
## Multiple R-squared:  0.0001831,  Adjusted R-squared:  -0.007169 
## F-statistic: 0.0249 on 1 and 136 DF,  p-value: 0.8748
which(is.na(compdat)==T) #verifying no NAs
## integer(0)
#Split into test and training sets
sample.ind<-sample(2,nrow(compdat),replace=T,prob=c(2/3,1/3)) #splitting dataset
train<-compdat[sample.ind==1,] #training set
dim(train) #93 observations in training set
## [1] 91 14
test<-compdat[sample.ind==2,] #test set
dim(test) #45 observations in test set
## [1] 47 14

Generalized linear model

##############################################################################
model<-as.formula(paste(colnames(train)[1],"~",paste(colnames(train)[2:dim(train)[2]],collapse="+")))
model
## BAD1_DNA_present ~ Latitude + Longitude + Elevation..m. + Site.Classification + 
##     Groundcover + Canopy.Classification + Distance.to.Water + 
##     Water.Classification + Landform + Nearby.Fecal.Matter + Rotting.Wood.Present + 
##     Sample.Type + Soil.pH
#train$BAD1_DNA_present<-as.numeric(as.character(train$BAD1_DNA_present))

glm.fit<-glm(model,family=binomial(link="logit"),data=train)
#glm.fit
summary.aov(aov(glm.fit))
##                       Df Sum Sq Mean Sq F value Pr(>F)  
## Latitude               1  0.384  0.3844   1.729 0.1938  
## Longitude              1  0.923  0.9230   4.151 0.0463 *
## Elevation..m.          1  0.096  0.0957   0.430 0.5145  
## Site.Classification    3  1.047  0.3491   1.570 0.2066  
## Groundcover            5  1.039  0.2078   0.934 0.4658  
## Canopy.Classification  2  0.316  0.1579   0.710 0.4958  
## Distance.to.Water      3  0.116  0.0386   0.173 0.9139  
## Water.Classification   5  1.394  0.2788   1.254 0.2965  
## Landform               5  2.708  0.5415   2.435 0.0454 *
## Nearby.Fecal.Matter    1  1.337  1.3366   6.011 0.0173 *
## Rotting.Wood.Present   1  0.014  0.0135   0.061 0.8061  
## Sample.Type            4  0.678  0.1694   0.762 0.5545  
## Soil.pH                1  0.022  0.0222   0.100 0.7530  
## Residuals             57 12.675  0.2224                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#test$BAD1_DNA_present<-as.numeric(as.character(test$BAD1_DNA_present))

y.hat<-predict(glm.fit,newdata=test[,2:ncol(test)],type = "response")
y.hat.test<-ifelse(y.hat>0.5,1,0)
y.test<-test[,1]

#Mean squared error
mean((y.hat.test-y.test)^2)
## [1] 0.3829787

Classification tree

##############################################################################
train$BAD1_DNA_present<-factor(train$BAD1_DNA_present)
test$BAD1_DNA_present<-factor(test$BAD1_DNA_present)

tr<-tree(model,data=train)
summary(tr)
## 
## Classification tree:
## tree(formula = model, data = train)
## Variables actually used in tree construction:
## [1] "Longitude"             "Site.Classification"   "Water.Classification" 
## [4] "Latitude"              "Elevation..m."         "Distance.to.Water"    
## [7] "Canopy.Classification" "Groundcover"           "Soil.pH"              
## Number of terminal nodes:  13 
## Residual mean deviance:  0.8519 = 66.45 / 78 
## Misclassification error rate: 0.2088 = 19 / 91
#testing error
y.hat<-predict(tr,newdata=test[,2:ncol(test)],type="class")
y.hat.test<-as.numeric(as.character(y.hat))

y.test<-test[,1]
y.test<-as.numeric(as.character(y.test))

#calculate misclassification error
mean((y.hat.test-y.test)^2)
## [1] 0.3404255
plot(tr)
text(tr,pretty=0)

randomForest

##############################################################################
train[,1]<-factor(train[,1])

rf<-randomForest(model,data=train,ntree=2000,
                 importance=TRUE)

rf
## 
## Call:
##  randomForest(formula = model, data = train, ntree = 2000, importance = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 2000
## No. of variables tried at each split: 3
## 
##         OOB estimate of  error rate: 36.26%
## Confusion matrix:
##    0  1 class.error
## 0 28 18   0.3913043
## 1 15 30   0.3333333
plot(rf) #try ntree = 800
legend("topright", colnames(rf$err.rate),col=1:4,cex=0.8,fill=1:4)

#Bar plot of variable importance
par(mar=c(10,10,1,1))
barplot(sort(rf$importance[,3])[2:ncol(train)],horiz = T,beside = T,
        las=2, xlab="Mean Decrease in Accuracy",  
        col=colorRampPalette(c("blue", "red"))(13))

#Calculate training error
yhat.train<-as.numeric(as.character(rf$predicted))
#yhat.train<-ifelse(yhat.train>0.5,1,0)
y.train<-as.numeric(as.character(train[,1]))

mean((yhat.train-y.train)^2) 
## [1] 0.3626374
#Test error
yhat.test<-as.numeric(as.character(predict(rf,newdata=test[,2:ncol(test)])))
yhat.test<-ifelse(yhat.test>0.5,1,0)
y.test<-as.numeric(as.character(test[,1]))
mean((yhat.test-y.test)^2) 
## [1] 0.212766
varImpPlot(rf)

#Now we can tune our model further
tuneRF(x = train[,c(2:ncol(train))],y=train[,1],plot = T,
       ntreeTry = 900,improve=0.001,trace=T)
## mtry = 3  OOB error = 37.36% 
## Searching left ...
## mtry = 2     OOB error = 35.16% 
## 0.05882353 0.001 
## mtry = 1     OOB error = 35.16% 
## 0 0.001 
## Searching right ...
## mtry = 6     OOB error = 34.07% 
## 0.03125 0.001 
## mtry = 12    OOB error = 35.16% 
## -0.03225806 0.001

##        mtry  OOBError
## 1.OOB     1 0.3516484
## 2.OOB     2 0.3516484
## 3.OOB     3 0.3736264
## 6.OOB     6 0.3406593
## 12.OOB   12 0.3516484
rf2<-randomForest(model,data=train,ntree=900,mtry=6,
                 importance=TRUE)
rf2 
## 
## Call:
##  randomForest(formula = model, data = train, ntree = 900, mtry = 6,      importance = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 900
## No. of variables tried at each split: 6
## 
##         OOB estimate of  error rate: 35.16%
## Confusion matrix:
##    0  1 class.error
## 0 28 18   0.3913043
## 1 14 31   0.3111111
yhat.test<-as.numeric(as.character(predict(rf2,newdata=test[,2:ncol(test)])))
yhat.test<-ifelse(yhat.test>0.5,1,0)
y.test<-as.numeric(as.character(test[,1]))
mean((yhat.test-y.test)^2)  
## [1] 0.212766

Boosting

##############################################################################
train$BAD1_DNA_present<-as.logical(as.numeric(as.character(train$BAD1_DNA_present)))

#Default parameters
brt2<-gbm.step(gbm.y=1,gbm.x=c(2:ncol(train)),
               learning.rate=0.005,data=train,                   
               family="bernoulli",verbose=F)
## 
##  
##  GBM STEP - version 2.9 
##  
## Performing cross-validation optimisation of a boosted regression tree model 
## for BAD1_DNA_present and using a family of bernoulli 
## Using 91 observations and 13 predictors 
## creating 10 initial models of 50 trees 
## 
##  folds are stratified by prevalence 
## total mean deviance =  1.3862 
## tolerance is fixed at  0.0014 
## now adding trees...
## fitting final gbm model with a fixed number of 500 trees for BAD1_DNA_present

#Try simplifying our model
brt.simp<-gbm.simplify(brt2,n.drops=5,)
## gbm.simplify - version 2.9 
## simplifying gbm.step model for BAD1_DNA_present with 13 predictors and 91 observations 
## original deviance = 1.3064(0.0496)
## a fixed number of 5 drops will be tested
## creating initial models...
## dropping predictor:
## 1
## 2
## 3
## 4
## 5
## 
## processing final dropping of variables with full data
## 1-Rotting.Wood.Present
## 2-Nearby.Fecal.Matter
## 3-Sample.Type
## 4-Distance.to.Water
## 5-Canopy.Classification

brt.simp$pred.list[[1]]
##  [1]  2  3  4  5  6  7  8  9 10 11 13 14
colnames(train)[12]
## [1] "Rotting.Wood.Present"
#Try our model with fewer variables
brt3<-gbm.step(gbm.y=1,gbm.x=brt.simp$pred.list[[1]],data=train,
               learning.rate=0.001,verbose=F) 
## 
##  
##  GBM STEP - version 2.9 
##  
## Performing cross-validation optimisation of a boosted regression tree model 
## for BAD1_DNA_present and using a family of bernoulli 
## Using 91 observations and 12 predictors 
## creating 10 initial models of 50 trees 
## 
##  folds are stratified by prevalence 
## total mean deviance =  1.3862 
## tolerance is fixed at  0.0014 
## now adding trees...
## fitting final gbm model with a fixed number of 3000 trees for BAD1_DNA_present

summary(brt3)

##                                         var    rel.inf
## Latitude                           Latitude 27.6367382
## Longitude                         Longitude 20.0083178
## Elevation..m.                 Elevation..m. 19.4435227
## Water.Classification   Water.Classification  8.5127872
## Site.Classification     Site.Classification  6.4272720
## Landform                           Landform  6.3205537
## Groundcover                     Groundcover  5.9768954
## Soil.pH                             Soil.pH  4.5001276
## Canopy.Classification Canopy.Classification  0.6905100
## Sample.Type                     Sample.Type  0.3753184
## Distance.to.Water         Distance.to.Water  0.1079570
## Nearby.Fecal.Matter     Nearby.Fecal.Matter  0.0000000
#Predict with our test data
y.hat <- predict.gbm(brt3, test,
                     n.trees=brt3$gbm.call$best.trees, 
                     type="response")
y.hat.test<-ifelse(y.hat > 0.5,1,0)
y.hat.test
##  [1] 1 0 1 1 1 0 0 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 0 1 0 0 0 0
## [36] 0 0 0 0 0 0 0 0 0 1 0 1
y.test<-as.numeric(as.character(test[,1]))
y.test
##  [1] 1 0 1 0 0 0 0 1 1 1 1 0 0 1 0 0 0 0 0 0 0 0 0 0 1 1 1 1 0 1 1 0 0 0 1
## [36] 0 0 0 0 0 0 0 0 0 1 0 0
#Calculate mean squared error
mean((y.hat.test-y.test)^2) 
## [1] 0.1914894
################################################################
#Try appling our model to unknown data

unkdat<-cleandat[cleandat$BAD1_DNA_present=="NA" | cleandat$BAD1_DNA_present=="ND",]
setwd("~/Documents/University_of_Minnesota/Nielsen_Lab/github/ecological-niche-modeling/")
source("src/clean_data.r")
unkdat<-clean_data(unkdat)
dim(unkdat)
## [1] 172  14
#BRT model applied
yhat.brt <- predict.gbm(brt3, newdata=unkdat[,2:ncol(unkdat)],
                     n.trees=brt3$gbm.call$best.trees, 
                     type="response")
yhat.brt<-ifelse(yhat.brt>0.5,1,0)

sum(yhat.brt)/length(yhat.brt) 
## [1] 0.5639535
################################################################
#Mapping our results

unkdat$BAD1_colors<-yhat.brt
unkdat$BAD1_colors<-ifelse(yhat.brt==1,"red","blue")

#Find and fix the outlier in Lake Superior
unkdat.n<-unkdat[!unkdat$Latitude==47.03755 & !unkdat$Longitude==-91.38154,]

mplot2<-ggmap(mp) + 
  geom_point(data = unkdat.n, aes(x=Longitude,y=Latitude,color=BAD1_colors))+
  scale_colour_manual(name = 'Blastomyces present',values=c("blue","red"),
                      labels=c("N","Y"))
mplot2