Install packages

#install.packages('sp')
#install.packages('raster')
#install.packages('rpart')
#install.packages('RColorBrewer')
#install.packages('rattle')
#install.packages('rpart.plot')
#install.packages('caret')
#install.packages('mlr')
#install.packages('DT')

Load packages

library(sp)
library(raster)
library(rpart)
library(RColorBrewer)
library(rattle)

## Loading required package: tibble

## Loading required package: bitops

## Rattle: A free graphical interface for data science with R.
## Version 5.4.0 Copyright (c) 2006-2020 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.

library(rpart.plot)
library(caret)

## Loading required package: lattice

## Loading required package: ggplot2

library(mlr)

## Loading required package: ParamHelpers

## 
## Attaching package: 'ParamHelpers'

## The following object is masked from 'package:raster':
## 
##     getValues

## Warning message: 'mlr' is in 'maintenance-only' mode since July 2019.
## Future development will only happen in 'mlr3'
## (<https://mlr3.mlr-org.com>). Due to the focus on 'mlr3' there might be
## uncaught bugs meanwhile in {mlr} - please consider switching.

## 
## Attaching package: 'mlr'

## The following object is masked from 'package:caret':
## 
##     train

## The following object is masked from 'package:raster':
## 
##     resample

library(DT)

Load final seleciton of variable dataset

load("C:/Users/Linde/Desktop/Scriptie/Schrijven begin/Textuele conceptversie/Final Conceptversion MSc Linde Berbers/Supplementary Information/Supplementary Data 2. RStudio Data/Script Data/EurekaAll_Brick_df.RData")

Split the present-day dataframe in half to create a train and test dataset.

## 50% of the sample size
smp_size0.5_Eureka <- floor(0.5 * nrow(EurekaAll_Brick_df))

## set the seed to make your partition reproducible
set.seed(123)
train_ind_Eureka <- sample(seq_len(nrow(EurekaAll_Brick_df)), size = smp_size0.5_Eureka)

train0.5_Eureka <- EurekaAll_Brick_df[train_ind_Eureka, ]
test0.5_Eureka <- EurekaAll_Brick_df[-train_ind_Eureka, ]

Make rpart model with the train dataset using ‘HigherThan1km’ as the response variable.

Train0.5.rpart_Eureka <- rpart(HigherThan1km ~ Land + GeoMean + Lithology + Temp + DistTrans + DistFault + Faults + DistPlateBound + Precip + DistTrench + DistVolc + Volc + Trans + Trench + PlateBound, data=train0.5_Eureka, method="class", cp=0.0001)

Pruning the decision tree: Choose best cp by minimal xerror

printcp(Train0.5.rpart_Eureka)

## 
## Classification tree:
## rpart(formula = HigherThan1km ~ Land + GeoMean + Lithology + 
##     Temp + DistTrans + DistFault + Faults + DistPlateBound + 
##     Precip + DistTrench + DistVolc + Volc + Trans + Trench + 
##     PlateBound, data = train0.5_Eureka, method = "class", cp = 1e-04)
## 
## Variables actually used in tree construction:
## [1] DistFault      DistPlateBound DistTrans      DistTrench     DistVolc      
## [6] GeoMean        Lithology      Precip         Temp          
## 
## Root node error: 2186/90000 = 0.024289
## 
## n= 90000 
## 
##            CP nsplit rel error  xerror      xstd
## 1  0.74290942      0   1.00000 1.00000 0.0211269
## 2  0.02927722      1   0.25709 0.26349 0.0109438
## 3  0.01440988      3   0.19854 0.20311 0.0096154
## 4  0.00411711      5   0.16972 0.17612 0.0089567
## 5  0.00320220      6   0.16560 0.17521 0.0089335
## 6  0.00251601      7   0.16240 0.17292 0.0088753
## 7  0.00228728     11   0.15233 0.17246 0.0088636
## 8  0.00182983     12   0.15005 0.17246 0.0088636
## 9  0.00167734     15   0.14456 0.17338 0.0088870
## 10 0.00160110     18   0.13952 0.17383 0.0088986
## 11 0.00152486     20   0.13632 0.17429 0.0089103
## 12 0.00137237     27   0.12489 0.16972 0.0087931
## 13 0.00091491     33   0.11665 0.16377 0.0086383
## 14 0.00045746     42   0.10750 0.16789 0.0087457
## 15 0.00022873     52   0.10293 0.16972 0.0087931
## 16 0.00010000     58   0.10156 0.17521 0.0089335

plotcp(Train0.5.rpart_Eureka)

Make new rpart model with train dataset + good cp

Train0.5.rpart_Eureka <- rpart(HigherThan1km ~ Land + GeoMean + Lithology + Temp + DistTrans + DistFault + Faults + DistPlateBound + Precip + DistTrench + DistVolc + Volc + Trans + Trench + PlateBound, data=train0.5_Eureka, method="class", cp=0.00182983)

Remove response variable (‘HigherThan1km’) from test dataset for predicting

NewData_test0.5_Eureka <- test0.5_Eureka
NewData_test0.5_Eureka$HigherThan1km <- NULL
datatable(head(NewData_test0.5_Eureka,10),options = list(pageLength=10, pageWidth = 5), autoHideNavigation = TRUE)

Let the train model predict the response variable (‘HigherThan1km’) with the test dataset

Predict_0.5_Class_Eureka <- predict(Train0.5.rpart_Eureka, newdata = NewData_test0.5_Eureka, type = "class")
Predict_0.5_Class_df_Eureka <- as.data.frame(Predict_0.5_Class_Eureka)

ACCURACY TESTING

Compare the difference between actual and predicted

Test0.5_df_Eureka <- test0.5_Eureka['HigherThan1km']
#Test0.5_df_Eureka
Predict_0.5_Class_df_Eureka <- as.data.frame(Predict_0.5_Class_Eureka)
#Predict_0.5_Class_df_Eureka
table(Test0.5_df_Eureka)

## Test0.5_df_Eureka
##     0     1 
## 87823  2177

table(Predict_0.5_Class_df_Eureka)

## Predict_0.5_Class_df_Eureka
##     0     1 
## 87894  2106

Confusion Matrix

Calculating confusion matrix components manually

TP <- sum(Test0.5_df_Eureka == 1 & Predict_0.5_Class_df_Eureka == 1)
TP

## [1] 1959

FP <- sum(Test0.5_df_Eureka == 0 & Predict_0.5_Class_df_Eureka == 1)
FP

## [1] 147

FN <- sum(Test0.5_df_Eureka == 1 & Predict_0.5_Class_df_Eureka == 0)
FN

## [1] 218

TN <- sum(Test0.5_df_Eureka == 0 & Predict_0.5_Class_df_Eureka == 0)
TN

## [1] 87676

Calculating Confusion Matrix by funtion

confusionMatrix(factor(Test0.5_df_Eureka == 1), factor(Predict_0.5_Class_df_Eureka == 1), dnn = c("Actual", "Predicted"))

## Confusion Matrix and Statistics
## 
##        Predicted
## Actual  FALSE  TRUE
##   FALSE 87676   147
##   TRUE    218  1959
##                                           
##                Accuracy : 0.9959          
##                  95% CI : (0.9955, 0.9963)
##     No Information Rate : 0.9766          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9127          
##                                           
##  Mcnemar's Test P-Value : 0.0002483       
##                                           
##             Sensitivity : 0.9975          
##             Specificity : 0.9302          
##          Pos Pred Value : 0.9983          
##          Neg Pred Value : 0.8999          
##              Prevalence : 0.9766          
##          Detection Rate : 0.9742          
##    Detection Prevalence : 0.9758          
##       Balanced Accuracy : 0.9639          
##                                           
##        'Positive' Class : FALSE           
##

Matthews Correlation Coefficient (MCC) MCC = ((TP * TN) - (FP * FN)) / sqrt((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN))

((TP*TN)-(FP*FN))

## [1] 171725238

(TP+FP)

## [1] 2106

(TP+FN)

## [1] 2177

(TN+FP)

## [1] 87823

(TN+FN)

## [1] 87894

MCC <- 171725238/sqrt(2106*2177*87823*87894)
MCC

## [1] 0.9128344

Make rasters of actual and predicted to plot

Into raster Test0.5_df_Eureka

Get the rownames of data subsets

rownames_test_0.5_cp_RpartAll <- rownames(Test0.5_df_Eureka)
rownames_test_0.5_cp_numeric_RpartAll <- as.numeric(rownames_test_0.5_cp_RpartAll)

Get coordinates of each cell in dataframe

Coordinates <- coordinates(EurekaAll_Brick)
Coordinates_df <- as.data.frame(Coordinates)

Get the coordinates of the predicted datasubsets

XCor_test_0.5_cp_RpartAll <- Coordinates_df$x[rownames_test_0.5_cp_numeric_RpartAll]
YCor_test_0.5_cp_RpartAll <- Coordinates_df$y[rownames_test_0.5_cp_numeric_RpartAll]

Get the coordinates into one file

XYCor_test_0.5_cp_RpartAll <- data.frame(XCor_test_0.5_cp_RpartAll, YCor_test_0.5_cp_RpartAll)

Get the coordinates and the 0s and 1s into one file.

XYZ_test_0.5_cp_RpartAll <- data.frame(XYCor_test_0.5_cp_RpartAll,Test0.5_df_Eureka)

To raster

RasterPlot_test_0.5_cp_RpartAll <- rasterFromXYZ(XYZ_test_0.5_cp_RpartAll, res=c(0.1,0.1), crs="+proj=longlat +datum=WGS84 +no_defs ")

Plot

plot(RasterPlot_test_0.5_cp_RpartAll, main="Test_0.5_cp")

To GeoTIFF

writeRaster(RasterPlot_test_0.5_cp_RpartAll, "RasterPlot_Test0.5_cp_RpartAll_Better", format ="GTiff", overwrite = TRUE)

## Warning in .gd_SetProject(object, ...): NOT UPDATED FOR PROJ >= 6

##Into raster Predict_0.5_Class_df_Eureka

Get the rownames of data subsets

rownames_pred_0.5_cp_RpartAll <- rownames(Predict_0.5_Class_df_Eureka)
rownames_pred_0.5_cp_numeric_RpartAll <- as.numeric(rownames_pred_0.5_cp_RpartAll)

Get coordinates of each cell in dataframe

Coordinates <- coordinates(EurekaAll_Brick)
Coordinates_df <- as.data.frame(Coordinates)

Get the coordinates of the predicted datasubsets

XCor_pred_0.5_cp_RpartAll <- Coordinates_df$x[rownames_pred_0.5_cp_numeric_RpartAll]
YCor_pred_0.5_cp_RpartAll <- Coordinates_df$y[rownames_pred_0.5_cp_numeric_RpartAll]

Get the coordinates into one file.

XYCor_pred_0.5_cp_RpartAll <- data.frame(XCor_pred_0.5_cp_RpartAll, YCor_pred_0.5_cp_RpartAll)

Get the coordinates and the 0s and 1s into one file.

XYZ_pred_0.5_cp_RpartAll <- data.frame(XYCor_pred_0.5_cp_RpartAll,Predict_0.5_Class_df_Eureka)

To raster

RasterPlot_pred_0.5_cp_RpartAll <- rasterFromXYZ(XYZ_pred_0.5_cp_RpartAll, res=c(0.1,0.1), crs="+proj=longlat +datum=WGS84 +no_defs ")

Plot

#plot(RasterPlot_pred_0.5_cp_RpartAll, main="Predicted_0.5_cp")

To GeoTIFF

writeRaster(RasterPlot_pred_0.5_cp_RpartAll, "RasterPlot_Pred0.5_cp_RpartAll_Better", format ="GTiff", overwrite = TRUE)

## Warning in .gd_SetProject(object, ...): NOT UPDATED FOR PROJ >= 6

Compare difference visually

plot(RasterPlot_test_0.5_cp_RpartAll, main="Test_0.5_cp")
plot(RasterPlot_pred_0.5_cp_RpartAll, main="Predicted_0.5_cp")

Make rpart model with the 100% dataset using ‘HigherThan1km’ as the response variable.

Rpart.EurekaAll <- rpart(HigherThan1km ~ Land + GeoMean + Lithology + Temp + DistTrans + DistFault + Faults + DistPlateBound + Precip + DistTrench + DistVolc + Volc + Trans + Trench + PlateBound, data=EurekaAll_Brick_df, method="class", cp=0.0001)

Choose best cp

printcp(Rpart.EurekaAll)

## 
## Classification tree:
## rpart(formula = HigherThan1km ~ Land + GeoMean + Lithology + 
##     Temp + DistTrans + DistFault + Faults + DistPlateBound + 
##     Precip + DistTrench + DistVolc + Volc + Trans + Trench + 
##     PlateBound, data = EurekaAll_Brick_df, method = "class", 
##     cp = 1e-04)
## 
## Variables actually used in tree construction:
## [1] DistFault      DistPlateBound DistTrans      DistTrench     DistVolc      
## [6] GeoMean        Lithology      Precip         Temp          
## 
## Root node error: 4363/180000 = 0.024239
## 
## n= 180000 
## 
##            CP nsplit rel error  xerror      xstd
## 1  0.73458629      0  1.000000 1.00000 0.0149547
## 2  0.03082741      1  0.265414 0.26541 0.0077744
## 3  0.01822141      3  0.203759 0.20811 0.0068891
## 4  0.00389640      5  0.167316 0.17580 0.0063341
## 5  0.00297960      6  0.163420 0.17488 0.0063176
## 6  0.00229200      8  0.157460 0.16892 0.0062095
## 7  0.00194820     10  0.152876 0.16617 0.0061590
## 8  0.00183360     12  0.148980 0.16365 0.0061122
## 9  0.00143250     14  0.145313 0.16204 0.0060823
## 10 0.00137520     21  0.133624 0.15769 0.0060004
## 11 0.00126060     24  0.129498 0.15723 0.0059917
## 12 0.00114600     26  0.126977 0.15494 0.0059480
## 13 0.00091680     30  0.121705 0.15081 0.0058686
## 14 0.00080220     38  0.114371 0.14806 0.0058150
## 15 0.00076400     40  0.112766 0.14806 0.0058150
## 16 0.00075309     43  0.110474 0.14554 0.0057655
## 17 0.00068760     52  0.103598 0.14554 0.0057655
## 18 0.00061120     54  0.102223 0.14577 0.0057700
## 19 0.00057300     57  0.100390 0.14692 0.0057925
## 20 0.00045840     62  0.097410 0.14692 0.0057925
## 21 0.00022920     66  0.095576 0.14829 0.0058195
## 22 0.00015280     76  0.093055 0.15196 0.0058907
## 23 0.00010000     79  0.092597 0.15494 0.0059480

plotcp(Rpart.EurekaAll)

Make new rpart model with good cp

Rpart.EurekaAll.Final <- rpart(HigherThan1km ~ Land + GeoMean + Lithology + Temp + DistTrans + DistFault + Faults + DistPlateBound + Precip + DistTrench + DistVolc + Volc + Trans + Trench + PlateBound, data=EurekaAll_Brick_df, method="class", cp=0.00080220)

Make figures of the decision tree

rpart.plot(Rpart.EurekaAll.Final,type = 5, extra = "auto",
    under = FALSE, fallen.leaves = TRUE,
    digits = 2, varlen = 0, faclen = 0, roundint = TRUE,
    cex = NULL, tweak = 1.1,
    clip.facs = FALSE, clip.right.labs = TRUE,
    snip = FALSE,
    box.palette = c("BnGn"), shadow.col = 0)

fancyRpartPlot(Rpart.EurekaAll.Final, palettes = c("Reds","Greens"), yesno=1, branch.col="black")

pdf('rpartplot10BrownGreen3')
rpart.plot(Rpart.EurekaAll.Final,type = 5, extra = "auto",
    under = FALSE, fallen.leaves = TRUE,
    digits = 2, varlen = 0, faclen = 0, roundint = TRUE,
    cex = NULL, tweak = 1.1,
    clip.facs = FALSE, clip.right.labs = TRUE,
    snip = FALSE,
    box.palette = c("BnGn"), shadow.col = 0)
dev.off()

## png 
##   2

pdf('FancyTryRedsGreenBlack3')
fancyRpartPlot(Rpart.EurekaAll.Final, palettes = c("Reds","Greens"), yesno=1, branch.col="black")
dev.off()

## png 
##   2

4 Developing the decision tree

Linde Berbers

2021-06-30