#Set working directory
setwd("~/Documents/University_of_Minnesota/Nielsen_Lab/github/ecological-niche-modeling/")
#Set random seed
set.seed(14114)
#Load in the data
crir<-read.csv("data/Blasto_Summary2.csv",as.is = TRUE, na.strings = c( "", " "))
colnames(crir)
## [1] "Sample.ID" "Blasto.Present..Y.N."
## [3] "Present.is.1" "Histo.Present..Y.N."
## [5] "Histo.Present.is.1" "Region"
## [7] "Date" "Time"
## [9] "Waypoint.ID" "Latitude"
## [11] "Longitude" "Elevation..m."
## [13] "Site.Classification" "Groundcover"
## [15] "Canopy.Classification" "Distance.to.Water"
## [17] "Water.Classification" "Landform"
## [19] "Nearby.Fecal.Matter" "Rotting.Wood.Present"
## [21] "Sample.Type" "Soil.pH"
## [23] "Sample.Notes" "Location.Notes"
dim(crir) #rows x columns
## [1] 90 24
#Avg detection of Blastomyces
b.avg.detection <- sum(crir$Blasto.Present..Y.N. == "Y")/length(crir$Blasto.Present..Y.N.)
b.avg.detection #52.22%
## [1] 0.5222222
#Avg detection of Histoplasma
h.avg.detection <- sum(crir$Histo.Present..Y.N. == "Y")/length(crir$Histo.Present..Y.N.)
h.avg.detection #46.67%
## [1] 0.4666667
colnames(crir)[2]<-"BAD1_DNA_present"
#Read in White Wilderness Data
wwt<-read.csv("data/WWT_Summary.csv",as.is=TRUE,na.strings=c(""," ")) #No Blasto data
dim(wwt)
## [1] 52 20
wwt$BAD1_DNA_present<-rep("ND",nrow(wwt))
arr<-read.csv("data/Arrowhead_Summary.csv",na.strings=c(""," "))
dim(arr)
## [1] 168 26
#Extract longitude/latitude coordinates
fix_coords<-function(x){
x$Latitude<-as.numeric(gsub("N","",x$Latitude))
x$Longitude<-as.numeric(gsub("W","",x$Longitude))*(-1)
return(x)
}
crir<-fix_coords(crir)
wwt<-fix_coords(wwt)
arr<-fix_coords(arr)
#Extract coordinates only for mapping purposes
coord_subset<-function(x){
x<-x[,colnames(x) %in% c("BAD1_DNA_present","Latitude","Longitude")]
return(x)
}
crir.sub<-coord_subset(crir)
wwt.sub<-coord_subset(wwt)
arr.sub<-coord_subset(arr)
arr.sub$BAD1_DNA_present<-gsub("NA","ND",arr.sub$BAD1_DNA_present)
#Merge datasets
mapdat<-rbind(crir.sub,wwt.sub,arr.sub)
meanlong<-mean(mapdat$Longitude)
meanlong #mean longitude
## [1] -92.08908
meanlat<-mean(mapdat$Latitude)
meanlat #mean latitude
## [1] 47.53175
#Draw map of sampling locations
mp <- get_map(location = c(long=meanlong-0.15,lat=meanlat), maptype = "terrain", source = "google", zoom = 8,filename = "minnesota")
## Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=47.531751,-92.239084&zoom=8&size=640x640&scale=2&maptype=terrain&language=en-EN&sensor=false
mapdat$BAD1_colors<-mapdat$BAD1_DNA_present
mapdat$BAD1_colors<-gsub("ND","black",mapdat$BAD1_colors)
mapdat$BAD1_colors<-gsub("N","blue",mapdat$BAD1_colors)
mapdat$BAD1_colors<-gsub("Y","red",mapdat$BAD1_colors)
mplot<-ggmap(mp) +
geom_point(data = mapdat, aes(x=Longitude,y=Latitude,color=BAD1_colors))+
scale_colour_manual(name = 'Blastomyces present',values=levels(factor(mapdat$BAD1_colors)),
labels=c("ND","N","Y"))
mplot

#Find and fix the outlier in Lake Superior
arr[arr$BAD1_DNA_present=="ND" & arr$Latitude < 47.1 & arr$Longitude > -91.4,]
## Sample.ID Random.Non.random Date Latitude Longitude Elevation..ft.
## 41 A16-21R R 6/6/16 47.03755 -91.38154 867
## 42 A16-21NR NR 6/6/16 47.03755 -91.38154 867
## Site.classification Groundcover Canopy Distance.to.water..m.
## 41 residential maintained grass no canopy no visible water
## 42 residential maintained grass no canopy no visible water
## Water.Classification Landform Fecal.Matter Rotting.wood.summary
## 41 n/a flat none Y
## 42 n/a flat none Y
## Rotting.Wood..Logs. Rotting.Wood..Branches.
## 41 N Y
## 42 N Y
## Rotting.Wood..twigs.small.debris. Rotting.Wood..Processed.wood.
## 41 Y N
## 42 Y N
## Sample.Type pH Sample.ID.1 DNA_extraction_attempted
## 41 soil 6.80 A16-21R Y
## 42 vegetative debris 6.68 A16-21NR Y
## DNA_concentration Limit_of_detection..ng.ml. BAD1_DNA_present
## 41 143.1 ND ND
## 42 20.5 ND ND
## X...comments
## 41 <NA>
## 42 <NA>
mapdat.n<-mapdat[mapdat$BAD1_DNA_present=="ND" & mapdat$Latitude < 47.1 & mapdat$Longitude > -91.4,]
mapdat.tr<-mapdat[-(as.numeric(rownames(mapdat.n))),]
mplot2<-ggmap(mp) +
geom_point(data = mapdat.tr, aes(x=Longitude,y=Latitude,color=BAD1_colors))+
scale_colour_manual(name = 'Blastomyces present',values=levels(factor(mapdat.tr$BAD1_colors)),
labels=c("ND","N","Y"))
mplot2

#Subset data to include variables of interest
#Site.classification vs. "Site.Classification"
#"Rotting.wood.summary" vs. Rotting.Wood.Present
#"Elevation..ft." vs. "Elevation..m."
#"Canopy" vs. Canopy.Classification
#Soil.pH vs. pH
#Distance.to.water..m. vs. Distance.to.Water
#Fecal.Matter vs. Nearby.Fecal.Matter
fix_colnames<-function(x){
colnames(x)[which(colnames(x)=="Rotting.wood.summary")] <- "Rotting.Wood.Present"
colnames(x)[which(colnames(x)=="Site.classification")] <-"Site.Classification"
colnames(x)[which(colnames(x)=="Canopy")] <-"Canopy.Classification"
colnames(x)[which(colnames(x)=="Distance.to.water..m.")]<-"Distance.to.Water"
colnames(x)[which(colnames(x)=="Elevation..ft.")]<-"Elevation..m."
colnames(x)[which(colnames(x)=="Fecal.Matter")]<-"Nearby.Fecal.Matter"
colnames(x)[which(colnames(x)=="pH")]<-"Soil.pH"
return(x)
}
#fix_colnames<-function(x){
#colnames(x)[colnames(x) %in% c("Rotting.wood.summary","pH","Site.classification","Canopy",
#"Distance.to.water..m.","Elevation..ft.",
#"Fecal.Matter")] <-c("Elevation..m.","Site.Classification",
#"Canopy.Classification","Distance.to.Water",
#"Nearby.Fecal.Matter","Rotting.Wood.Present","Soil.pH")
#return(x)
#}
##Fix column names to match the CR/IR dataset
arr.c<-fix_colnames(arr)
arr.c$Region<-"ARR" #Add the arrowhead region classification
wwt.c<-fix_colnames(wwt)
wwt.c$Region<-"WWT" #Add the white wilderness sampling classification
##Need to adjust elevation from ft to meters (multiply feet by 0.3048)
fix_elevation<-function(x){
x$Elevation..m.<-round((x$Elevation..m. * 0.3048),2)
return(x)
}
arr.f<-fix_elevation(arr.c)
wwt.f<-fix_elevation(wwt.c)
#Subset variables for predictors we are interested in
var_subset<-function(x){
x<-x[,colnames(x) %in% c("BAD1_DNA_present","Latitude","Longitude",
"Site.Classification","Groundcover","Canopy.Classification",
"Elevation..m.","Distance.to.Water","Water.Classification",
"Landform","Nearby.Fecal.Matter","Rotting.Wood.Present",
"Sample.Type","Soil.pH")]
return(x)
}
crir.v<-var_subset(crir)
wwt.v<-var_subset(wwt.f)
arr.v<-var_subset(arr.f)
#Merge the data
mergdat<-rbind(crir.v,wwt.v,arr.v) #all data
dim(mergdat)
## [1] 310 14
#310 observations of 14 variables
#Print uncleaned data
for(i in 6:ncol(mergdat)-1){
cat(paste0("=========",colnames(mergdat)[i],"========="))
print(table(mergdat[,i]))
cat("\n")
}
## =========Site.Classification=========
## Bog/Marsh Creekside forest Forest grassland Grassland
## 14 2 116 55 30 14
## residential Residential Roadside Shoreline urban wetland
## 8 4 1 6 4 2
## wetland
## 54
##
## =========Groundcover=========
## bare soil Bare Soil forest Forest
## 33 2 74 26
## Grassland Gravel maintained grass Maintained Grass
## 15 1 8 1
## Moss natural grassland Sand vegetative debris
## 10 36 2 7
## Vegetative Debris wetland Woodchips
## 32 62 1
##
## =========Canopy.Classification=========
## 6 Full full canopy No no canopy
## 2 8 2 24 111
## Partial partial canopy
## 58 105
##
## =========Distance.to.Water=========
## <1 0 1 10
## 66 30 38 12
## 2 3 4 5
## 2 12 2 15
## 6 no visible water x
## 2 80 51
##
## =========Water.Classification=========
## Bog/Marsh Creek lake Lake lake
## 8 2 10 7 2
## large puddle Large Puddle n/a pond Pond
## 24 6 80 12 5
## river River small puddle Small Puddle stream
## 26 6 4 7 14
## wetland x
## 48 49
##
## =========Landform=========
## cliff Creek creek flat gentle slope
## 4 2 4 122 36
## Gentle Slope hilltop Pit shoreline Shoreline
## 67 2 2 42 14
## steep slope Steep Slope valley
## 4 1 6
##
## =========Nearby.Fecal.Matter=========
## dropping droppings Droppings none x
## 2 10 5 208 1
##
## =========Rotting.Wood.Present=========
## N N No
## 76 1 23
## Twigs/Small Debris Y Yes
## 11 143 56
##
## =========Sample.Type=========
## Ash/Soil Fecal Matter moss mulch chips
## 1 2 1 1
## peat moss Peat Moss Plant Debris Rotting Wood
## 1 4 3 10
## rotting wood Sand seaweed soil
## 54 2 1 123
## Soil soil vegetative debris
## 68 11 28
#Clean the data for predictive modeling
source("src/clean_data.r")
cleandat<-clean_data(mergdat)
#Print cleaned data
for(i in 6:ncol(mergdat)-1){
cat(paste0("=========",colnames(cleandat)[i],"========="))
print(table(cleandat[,i]))
cat("\n")
}
## =========Site.Classification=========
## forest grassland residential shoreline wetland
## 171 44 17 6 72
##
## =========Groundcover=========
## forest grassland moss soil/sand/gravel
## 100 60 10 38
## vegetative debris wetland
## 40 62
##
## =========Canopy.Classification=========
## full no partial
## 10 137 163
##
## =========Distance.to.Water=========
## <1 0 1-10 x
## 66 30 83 131
##
## =========Water.Classification=========
## lake pond puddle river stream wetland x
## 19 17 41 34 14 56 129
##
## =========Landform=========
## cliff creek flat shoreline slope valley/pit
## 4 6 122 56 110 8
## x
## 4
##
## =========Nearby.Fecal.Matter=========
## droppings none
## 17 293
##
## =========Rotting.Wood.Present=========
## N Y
## 100 210
##
## =========Sample.Type=========
## fecal matter peat moss rotting wood soil/sand
## 2 6 64 205
## vegetative debris
## 33
#Get data for which we have points
compdat<-cleandat[cleandat$BAD1_DNA_present=="Y" | cleandat$BAD1_DNA_present=="N",]
#Define the response variable as a boolean (1 for "Y", 0 for "N")
compdat$BAD1_DNA_present<-ifelse(test=compdat$BAD1_DNA_present=="Y",1,0)
compdat$BAD1_DNA_present
## [1] 1 1 0 0 1 1 0 0 0 1 1 1 1 1 0 1 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 0 0
## [36] 1 1 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 1 0 0 1 0 0 0 1 1
## [71] 1 1 1 1 1 0 1 1 1 1 0 0 0 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 1 1 0 0 0 1 0
## [106] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 1 0 1 1 1 0 0 1 0 1
###Continuous variables
#Elevation
hist(compdat$Elevation..m.) #negative skew

summary(compdat$Elevation..m.)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3.892 6.038 6.082 6.004 6.163 6.205
#pH
hist(compdat$Soil.pH) #looks relatively normal

summary(compdat$Soil.pH)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3.040 5.210 5.705 5.636 6.200 7.880
corfit<-lm(compdat$Soil.pH ~ compdat$Elevation..m.)
plot(compdat$Soil.pH ~ compdat$Elevation..m.) #no correlation

summary(corfit) #p=0.875, no correlation
##
## Call:
## lm(formula = compdat$Soil.pH ~ compdat$Elevation..m.)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.59780 -0.41994 0.06647 0.56337 2.24480
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.47483 1.02170 5.359 3.49e-07 ***
## compdat$Elevation..m. 0.02678 0.16974 0.158 0.875
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8461 on 136 degrees of freedom
## Multiple R-squared: 0.0001831, Adjusted R-squared: -0.007169
## F-statistic: 0.0249 on 1 and 136 DF, p-value: 0.8748
which(is.na(compdat)==T) #verifying no NAs
## integer(0)
#Split into test and training sets
sample.ind<-sample(2,nrow(compdat),replace=T,prob=c(2/3,1/3)) #splitting dataset
train<-compdat[sample.ind==1,] #training set
dim(train) #93 observations in training set
## [1] 91 14
test<-compdat[sample.ind==2,] #test set
dim(test) #45 observations in test set
## [1] 47 14
Generalized linear model
##############################################################################
model<-as.formula(paste(colnames(train)[1],"~",paste(colnames(train)[2:dim(train)[2]],collapse="+")))
model
## BAD1_DNA_present ~ Latitude + Longitude + Elevation..m. + Site.Classification +
## Groundcover + Canopy.Classification + Distance.to.Water +
## Water.Classification + Landform + Nearby.Fecal.Matter + Rotting.Wood.Present +
## Sample.Type + Soil.pH
#train$BAD1_DNA_present<-as.numeric(as.character(train$BAD1_DNA_present))
glm.fit<-glm(model,family=binomial(link="logit"),data=train)
#glm.fit
summary.aov(aov(glm.fit))
## Df Sum Sq Mean Sq F value Pr(>F)
## Latitude 1 0.384 0.3844 1.729 0.1938
## Longitude 1 0.923 0.9230 4.151 0.0463 *
## Elevation..m. 1 0.096 0.0957 0.430 0.5145
## Site.Classification 3 1.047 0.3491 1.570 0.2066
## Groundcover 5 1.039 0.2078 0.934 0.4658
## Canopy.Classification 2 0.316 0.1579 0.710 0.4958
## Distance.to.Water 3 0.116 0.0386 0.173 0.9139
## Water.Classification 5 1.394 0.2788 1.254 0.2965
## Landform 5 2.708 0.5415 2.435 0.0454 *
## Nearby.Fecal.Matter 1 1.337 1.3366 6.011 0.0173 *
## Rotting.Wood.Present 1 0.014 0.0135 0.061 0.8061
## Sample.Type 4 0.678 0.1694 0.762 0.5545
## Soil.pH 1 0.022 0.0222 0.100 0.7530
## Residuals 57 12.675 0.2224
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#test$BAD1_DNA_present<-as.numeric(as.character(test$BAD1_DNA_present))
y.hat<-predict(glm.fit,newdata=test[,2:ncol(test)],type = "response")
y.hat.test<-ifelse(y.hat>0.5,1,0)
y.test<-test[,1]
#Mean squared error
mean((y.hat.test-y.test)^2)
## [1] 0.3829787
Classification tree
##############################################################################
train$BAD1_DNA_present<-factor(train$BAD1_DNA_present)
test$BAD1_DNA_present<-factor(test$BAD1_DNA_present)
tr<-tree(model,data=train)
summary(tr)
##
## Classification tree:
## tree(formula = model, data = train)
## Variables actually used in tree construction:
## [1] "Longitude" "Site.Classification" "Water.Classification"
## [4] "Latitude" "Elevation..m." "Distance.to.Water"
## [7] "Canopy.Classification" "Groundcover" "Soil.pH"
## Number of terminal nodes: 13
## Residual mean deviance: 0.8519 = 66.45 / 78
## Misclassification error rate: 0.2088 = 19 / 91
#testing error
y.hat<-predict(tr,newdata=test[,2:ncol(test)],type="class")
y.hat.test<-as.numeric(as.character(y.hat))
y.test<-test[,1]
y.test<-as.numeric(as.character(y.test))
#calculate misclassification error
mean((y.hat.test-y.test)^2)
## [1] 0.3404255
plot(tr)
text(tr,pretty=0)

randomForest
##############################################################################
train[,1]<-factor(train[,1])
rf<-randomForest(model,data=train,ntree=2000,
importance=TRUE)
rf
##
## Call:
## randomForest(formula = model, data = train, ntree = 2000, importance = TRUE)
## Type of random forest: classification
## Number of trees: 2000
## No. of variables tried at each split: 3
##
## OOB estimate of error rate: 36.26%
## Confusion matrix:
## 0 1 class.error
## 0 28 18 0.3913043
## 1 15 30 0.3333333
plot(rf) #try ntree = 800
legend("topright", colnames(rf$err.rate),col=1:4,cex=0.8,fill=1:4)

#Bar plot of variable importance
par(mar=c(10,10,1,1))
barplot(sort(rf$importance[,3])[2:ncol(train)],horiz = T,beside = T,
las=2, xlab="Mean Decrease in Accuracy",
col=colorRampPalette(c("blue", "red"))(13))

#Calculate training error
yhat.train<-as.numeric(as.character(rf$predicted))
#yhat.train<-ifelse(yhat.train>0.5,1,0)
y.train<-as.numeric(as.character(train[,1]))
mean((yhat.train-y.train)^2)
## [1] 0.3626374
#Test error
yhat.test<-as.numeric(as.character(predict(rf,newdata=test[,2:ncol(test)])))
yhat.test<-ifelse(yhat.test>0.5,1,0)
y.test<-as.numeric(as.character(test[,1]))
mean((yhat.test-y.test)^2)
## [1] 0.212766
varImpPlot(rf)

#Now we can tune our model further
tuneRF(x = train[,c(2:ncol(train))],y=train[,1],plot = T,
ntreeTry = 900,improve=0.001,trace=T)
## mtry = 3 OOB error = 37.36%
## Searching left ...
## mtry = 2 OOB error = 35.16%
## 0.05882353 0.001
## mtry = 1 OOB error = 35.16%
## 0 0.001
## Searching right ...
## mtry = 6 OOB error = 34.07%
## 0.03125 0.001
## mtry = 12 OOB error = 35.16%
## -0.03225806 0.001

## mtry OOBError
## 1.OOB 1 0.3516484
## 2.OOB 2 0.3516484
## 3.OOB 3 0.3736264
## 6.OOB 6 0.3406593
## 12.OOB 12 0.3516484
rf2<-randomForest(model,data=train,ntree=900,mtry=6,
importance=TRUE)
rf2
##
## Call:
## randomForest(formula = model, data = train, ntree = 900, mtry = 6, importance = TRUE)
## Type of random forest: classification
## Number of trees: 900
## No. of variables tried at each split: 6
##
## OOB estimate of error rate: 35.16%
## Confusion matrix:
## 0 1 class.error
## 0 28 18 0.3913043
## 1 14 31 0.3111111
yhat.test<-as.numeric(as.character(predict(rf2,newdata=test[,2:ncol(test)])))
yhat.test<-ifelse(yhat.test>0.5,1,0)
y.test<-as.numeric(as.character(test[,1]))
mean((yhat.test-y.test)^2)
## [1] 0.212766
Boosting
##############################################################################
train$BAD1_DNA_present<-as.logical(as.numeric(as.character(train$BAD1_DNA_present)))
#Default parameters
brt2<-gbm.step(gbm.y=1,gbm.x=c(2:ncol(train)),
learning.rate=0.005,data=train,
family="bernoulli",verbose=F)
##
##
## GBM STEP - version 2.9
##
## Performing cross-validation optimisation of a boosted regression tree model
## for BAD1_DNA_present and using a family of bernoulli
## Using 91 observations and 13 predictors
## creating 10 initial models of 50 trees
##
## folds are stratified by prevalence
## total mean deviance = 1.3862
## tolerance is fixed at 0.0014
## now adding trees...
## fitting final gbm model with a fixed number of 500 trees for BAD1_DNA_present

#Try simplifying our model
brt.simp<-gbm.simplify(brt2,n.drops=5,)
## gbm.simplify - version 2.9
## simplifying gbm.step model for BAD1_DNA_present with 13 predictors and 91 observations
## original deviance = 1.3064(0.0496)
## a fixed number of 5 drops will be tested
## creating initial models...
## dropping predictor:
## 1
## 2
## 3
## 4
## 5
##
## processing final dropping of variables with full data
## 1-Rotting.Wood.Present
## 2-Nearby.Fecal.Matter
## 3-Sample.Type
## 4-Distance.to.Water
## 5-Canopy.Classification

brt.simp$pred.list[[1]]
## [1] 2 3 4 5 6 7 8 9 10 11 13 14
colnames(train)[12]
## [1] "Rotting.Wood.Present"
#Try our model with fewer variables
brt3<-gbm.step(gbm.y=1,gbm.x=brt.simp$pred.list[[1]],data=train,
learning.rate=0.001,verbose=F)
##
##
## GBM STEP - version 2.9
##
## Performing cross-validation optimisation of a boosted regression tree model
## for BAD1_DNA_present and using a family of bernoulli
## Using 91 observations and 12 predictors
## creating 10 initial models of 50 trees
##
## folds are stratified by prevalence
## total mean deviance = 1.3862
## tolerance is fixed at 0.0014
## now adding trees...
## fitting final gbm model with a fixed number of 3000 trees for BAD1_DNA_present

summary(brt3)

## var rel.inf
## Latitude Latitude 27.6367382
## Longitude Longitude 20.0083178
## Elevation..m. Elevation..m. 19.4435227
## Water.Classification Water.Classification 8.5127872
## Site.Classification Site.Classification 6.4272720
## Landform Landform 6.3205537
## Groundcover Groundcover 5.9768954
## Soil.pH Soil.pH 4.5001276
## Canopy.Classification Canopy.Classification 0.6905100
## Sample.Type Sample.Type 0.3753184
## Distance.to.Water Distance.to.Water 0.1079570
## Nearby.Fecal.Matter Nearby.Fecal.Matter 0.0000000
#Predict with our test data
y.hat <- predict.gbm(brt3, test,
n.trees=brt3$gbm.call$best.trees,
type="response")
y.hat.test<-ifelse(y.hat > 0.5,1,0)
y.hat.test
## [1] 1 0 1 1 1 0 0 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 0 1 0 0 0 0
## [36] 0 0 0 0 0 0 0 0 0 1 0 1
y.test<-as.numeric(as.character(test[,1]))
y.test
## [1] 1 0 1 0 0 0 0 1 1 1 1 0 0 1 0 0 0 0 0 0 0 0 0 0 1 1 1 1 0 1 1 0 0 0 1
## [36] 0 0 0 0 0 0 0 0 0 1 0 0
#Calculate mean squared error
mean((y.hat.test-y.test)^2)
## [1] 0.1914894
################################################################
#Try appling our model to unknown data
unkdat<-cleandat[cleandat$BAD1_DNA_present=="NA" | cleandat$BAD1_DNA_present=="ND",]
setwd("~/Documents/University_of_Minnesota/Nielsen_Lab/github/ecological-niche-modeling/")
source("src/clean_data.r")
unkdat<-clean_data(unkdat)
dim(unkdat)
## [1] 172 14
#BRT model applied
yhat.brt <- predict.gbm(brt3, newdata=unkdat[,2:ncol(unkdat)],
n.trees=brt3$gbm.call$best.trees,
type="response")
yhat.brt<-ifelse(yhat.brt>0.5,1,0)
sum(yhat.brt)/length(yhat.brt)
## [1] 0.5639535
################################################################
#Mapping our results
unkdat$BAD1_colors<-yhat.brt
unkdat$BAD1_colors<-ifelse(yhat.brt==1,"red","blue")
#Find and fix the outlier in Lake Superior
unkdat.n<-unkdat[!unkdat$Latitude==47.03755 & !unkdat$Longitude==-91.38154,]
mplot2<-ggmap(mp) +
geom_point(data = unkdat.n, aes(x=Longitude,y=Latitude,color=BAD1_colors))+
scale_colour_manual(name = 'Blastomyces present',values=c("blue","red"),
labels=c("N","Y"))
mplot2
