4 Developing the decision tree

Linde Berbers

2021-06-30

Install packages

#install.packages('sp')
#install.packages('raster')
#install.packages('rpart')
#install.packages('RColorBrewer')
#install.packages('rattle')
#install.packages('rpart.plot')
#install.packages('caret')
#install.packages('mlr')
#install.packages('DT')

Load packages

library(sp)
library(raster)
library(rpart)
library(RColorBrewer)
library(rattle)
#> Loading required package: tibble
#> Loading required package: bitops
#> Rattle: A free graphical interface for data science with R.
#> Version 5.4.0 Copyright (c) 2006-2020 Togaware Pty Ltd.
#> Type 'rattle()' to shake, rattle, and roll your data.
library(rpart.plot)
library(caret)
#> Loading required package: lattice
#> Loading required package: ggplot2
library(mlr)
#> Loading required package: ParamHelpers
#> 
#> Attaching package: 'ParamHelpers'
#> The following object is masked from 'package:raster':
#> 
#>     getValues
#> Warning message: 'mlr' is in 'maintenance-only' mode since July 2019.
#> Future development will only happen in 'mlr3'
#> (<https://mlr3.mlr-org.com>). Due to the focus on 'mlr3' there might be
#> uncaught bugs meanwhile in {mlr} - please consider switching.
#> 
#> Attaching package: 'mlr'
#> The following object is masked from 'package:caret':
#> 
#>     train
#> The following object is masked from 'package:raster':
#> 
#>     resample
library(DT)

Load final seleciton of variable dataset

load("C:/Users/Linde/Desktop/Scriptie/Schrijven begin/Textuele conceptversie/Final Conceptversion MSc Linde Berbers/Supplementary Information/Supplementary Data 2. RStudio Data/Script Data/EurekaAll_Brick_df.RData")

Split the present-day dataframe in half to create a train and test dataset.

## 50% of the sample size
smp_size0.5_Eureka <- floor(0.5 * nrow(EurekaAll_Brick_df))

## set the seed to make your partition reproducible
set.seed(123)
train_ind_Eureka <- sample(seq_len(nrow(EurekaAll_Brick_df)), size = smp_size0.5_Eureka)

train0.5_Eureka <- EurekaAll_Brick_df[train_ind_Eureka, ]
test0.5_Eureka <- EurekaAll_Brick_df[-train_ind_Eureka, ]

Make rpart model with the train dataset using ‘HigherThan1km’ as the response variable.

Train0.5.rpart_Eureka <- rpart(HigherThan1km ~ Land + GeoMean + Lithology + Temp + DistTrans + DistFault + Faults + DistPlateBound + Precip + DistTrench + DistVolc + Volc + Trans + Trench + PlateBound, data=train0.5_Eureka, method="class", cp=0.0001)

Pruning the decision tree: Choose best cp by minimal xerror

printcp(Train0.5.rpart_Eureka)
#> 
#> Classification tree:
#> rpart(formula = HigherThan1km ~ Land + GeoMean + Lithology + 
#>     Temp + DistTrans + DistFault + Faults + DistPlateBound + 
#>     Precip + DistTrench + DistVolc + Volc + Trans + Trench + 
#>     PlateBound, data = train0.5_Eureka, method = "class", cp = 1e-04)
#> 
#> Variables actually used in tree construction:
#> [1] DistFault      DistPlateBound DistTrans      DistTrench     DistVolc      
#> [6] GeoMean        Lithology      Precip         Temp          
#> 
#> Root node error: 2186/90000 = 0.024289
#> 
#> n= 90000 
#> 
#>            CP nsplit rel error  xerror      xstd
#> 1  0.74290942      0   1.00000 1.00000 0.0211269
#> 2  0.02927722      1   0.25709 0.26349 0.0109438
#> 3  0.01440988      3   0.19854 0.20311 0.0096154
#> 4  0.00411711      5   0.16972 0.17612 0.0089567
#> 5  0.00320220      6   0.16560 0.17521 0.0089335
#> 6  0.00251601      7   0.16240 0.17292 0.0088753
#> 7  0.00228728     11   0.15233 0.17246 0.0088636
#> 8  0.00182983     12   0.15005 0.17246 0.0088636
#> 9  0.00167734     15   0.14456 0.17338 0.0088870
#> 10 0.00160110     18   0.13952 0.17383 0.0088986
#> 11 0.00152486     20   0.13632 0.17429 0.0089103
#> 12 0.00137237     27   0.12489 0.16972 0.0087931
#> 13 0.00091491     33   0.11665 0.16377 0.0086383
#> 14 0.00045746     42   0.10750 0.16789 0.0087457
#> 15 0.00022873     52   0.10293 0.16972 0.0087931
#> 16 0.00010000     58   0.10156 0.17521 0.0089335
plotcp(Train0.5.rpart_Eureka)

Make new rpart model with train dataset + good cp

Train0.5.rpart_Eureka <- rpart(HigherThan1km ~ Land + GeoMean + Lithology + Temp + DistTrans + DistFault + Faults + DistPlateBound + Precip + DistTrench + DistVolc + Volc + Trans + Trench + PlateBound, data=train0.5_Eureka, method="class", cp=0.00182983)

Remove response variable (‘HigherThan1km’) from test dataset for predicting

NewData_test0.5_Eureka <- test0.5_Eureka
NewData_test0.5_Eureka$HigherThan1km <- NULL
datatable(head(NewData_test0.5_Eureka,10),options = list(pageLength=10, pageWidth = 5), autoHideNavigation = TRUE)

Let the train model predict the response variable (‘HigherThan1km’) with the test dataset

Predict_0.5_Class_Eureka <- predict(Train0.5.rpart_Eureka, newdata = NewData_test0.5_Eureka, type = "class")
Predict_0.5_Class_df_Eureka <- as.data.frame(Predict_0.5_Class_Eureka)

ACCURACY TESTING

Compare the difference between actual and predicted

Test0.5_df_Eureka <- test0.5_Eureka['HigherThan1km']
#Test0.5_df_Eureka
Predict_0.5_Class_df_Eureka <- as.data.frame(Predict_0.5_Class_Eureka)
#Predict_0.5_Class_df_Eureka
table(Test0.5_df_Eureka)
#> Test0.5_df_Eureka
#>     0     1 
#> 87823  2177
table(Predict_0.5_Class_df_Eureka)
#> Predict_0.5_Class_df_Eureka
#>     0     1 
#> 87894  2106

Confusion Matrix

Calculating confusion matrix components manually

TP <- sum(Test0.5_df_Eureka == 1 & Predict_0.5_Class_df_Eureka == 1)
TP
#> [1] 1959
FP <- sum(Test0.5_df_Eureka == 0 & Predict_0.5_Class_df_Eureka == 1)
FP
#> [1] 147
FN <- sum(Test0.5_df_Eureka == 1 & Predict_0.5_Class_df_Eureka == 0)
FN
#> [1] 218
TN <- sum(Test0.5_df_Eureka == 0 & Predict_0.5_Class_df_Eureka == 0)
TN
#> [1] 87676

Calculating Confusion Matrix by funtion

confusionMatrix(factor(Test0.5_df_Eureka == 1), factor(Predict_0.5_Class_df_Eureka == 1), dnn = c("Actual", "Predicted"))
#> Confusion Matrix and Statistics
#> 
#>        Predicted
#> Actual  FALSE  TRUE
#>   FALSE 87676   147
#>   TRUE    218  1959
#>                                           
#>                Accuracy : 0.9959          
#>                  95% CI : (0.9955, 0.9963)
#>     No Information Rate : 0.9766          
#>     P-Value [Acc > NIR] : < 2.2e-16       
#>                                           
#>                   Kappa : 0.9127          
#>                                           
#>  Mcnemar's Test P-Value : 0.0002483       
#>                                           
#>             Sensitivity : 0.9975          
#>             Specificity : 0.9302          
#>          Pos Pred Value : 0.9983          
#>          Neg Pred Value : 0.8999          
#>              Prevalence : 0.9766          
#>          Detection Rate : 0.9742          
#>    Detection Prevalence : 0.9758          
#>       Balanced Accuracy : 0.9639          
#>                                           
#>        'Positive' Class : FALSE           
#> 

Matthews Correlation Coefficient (MCC) MCC = ((TP * TN) - (FP * FN)) / sqrt((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN))

((TP*TN)-(FP*FN))
#> [1] 171725238
(TP+FP)
#> [1] 2106
(TP+FN)
#> [1] 2177
(TN+FP)
#> [1] 87823
(TN+FN)
#> [1] 87894
MCC <- 171725238/sqrt(2106*2177*87823*87894)
MCC 
#> [1] 0.9128344

Make rasters of actual and predicted to plot

Into raster Test0.5_df_Eureka

Get the rownames of data subsets

rownames_test_0.5_cp_RpartAll <- rownames(Test0.5_df_Eureka)
rownames_test_0.5_cp_numeric_RpartAll <- as.numeric(rownames_test_0.5_cp_RpartAll)

Get coordinates of each cell in dataframe

Coordinates <- coordinates(EurekaAll_Brick)
Coordinates_df <- as.data.frame(Coordinates)

Get the coordinates of the predicted datasubsets

XCor_test_0.5_cp_RpartAll <- Coordinates_df$x[rownames_test_0.5_cp_numeric_RpartAll]
YCor_test_0.5_cp_RpartAll <- Coordinates_df$y[rownames_test_0.5_cp_numeric_RpartAll]

Get the coordinates into one file

XYCor_test_0.5_cp_RpartAll <- data.frame(XCor_test_0.5_cp_RpartAll, YCor_test_0.5_cp_RpartAll)

Get the coordinates and the 0s and 1s into one file.

XYZ_test_0.5_cp_RpartAll <- data.frame(XYCor_test_0.5_cp_RpartAll,Test0.5_df_Eureka)

To raster

RasterPlot_test_0.5_cp_RpartAll <- rasterFromXYZ(XYZ_test_0.5_cp_RpartAll, res=c(0.1,0.1), crs="+proj=longlat +datum=WGS84 +no_defs ")

Plot

plot(RasterPlot_test_0.5_cp_RpartAll, main="Test_0.5_cp")

To GeoTIFF

writeRaster(RasterPlot_test_0.5_cp_RpartAll, "RasterPlot_Test0.5_cp_RpartAll_Better", format ="GTiff", overwrite = TRUE)
#> Warning in .gd_SetProject(object, ...): NOT UPDATED FOR PROJ >= 6

##Into raster Predict_0.5_Class_df_Eureka

Get the rownames of data subsets

rownames_pred_0.5_cp_RpartAll <- rownames(Predict_0.5_Class_df_Eureka)
rownames_pred_0.5_cp_numeric_RpartAll <- as.numeric(rownames_pred_0.5_cp_RpartAll)

Get coordinates of each cell in dataframe

Coordinates <- coordinates(EurekaAll_Brick)
Coordinates_df <- as.data.frame(Coordinates)

Get the coordinates of the predicted datasubsets

XCor_pred_0.5_cp_RpartAll <- Coordinates_df$x[rownames_pred_0.5_cp_numeric_RpartAll]
YCor_pred_0.5_cp_RpartAll <- Coordinates_df$y[rownames_pred_0.5_cp_numeric_RpartAll]

Get the coordinates into one file.

XYCor_pred_0.5_cp_RpartAll <- data.frame(XCor_pred_0.5_cp_RpartAll, YCor_pred_0.5_cp_RpartAll)

Get the coordinates and the 0s and 1s into one file.

XYZ_pred_0.5_cp_RpartAll <- data.frame(XYCor_pred_0.5_cp_RpartAll,Predict_0.5_Class_df_Eureka)

To raster

RasterPlot_pred_0.5_cp_RpartAll <- rasterFromXYZ(XYZ_pred_0.5_cp_RpartAll, res=c(0.1,0.1), crs="+proj=longlat +datum=WGS84 +no_defs ")

Plot

#plot(RasterPlot_pred_0.5_cp_RpartAll, main="Predicted_0.5_cp")

To GeoTIFF

writeRaster(RasterPlot_pred_0.5_cp_RpartAll, "RasterPlot_Pred0.5_cp_RpartAll_Better", format ="GTiff", overwrite = TRUE)
#> Warning in .gd_SetProject(object, ...): NOT UPDATED FOR PROJ >= 6

Compare difference visually

Make rpart model with the 100% dataset using ‘HigherThan1km’ as the response variable.

Choose best cp

printcp(Rpart.EurekaAll)
#> 
#> Classification tree:
#> rpart(formula = HigherThan1km ~ Land + GeoMean + Lithology + 
#>     Temp + DistTrans + DistFault + Faults + DistPlateBound + 
#>     Precip + DistTrench + DistVolc + Volc + Trans + Trench + 
#>     PlateBound, data = EurekaAll_Brick_df, method = "class", 
#>     cp = 1e-04)
#> 
#> Variables actually used in tree construction:
#> [1] DistFault      DistPlateBound DistTrans      DistTrench     DistVolc      
#> [6] GeoMean        Lithology      Precip         Temp          
#> 
#> Root node error: 4363/180000 = 0.024239
#> 
#> n= 180000 
#> 
#>            CP nsplit rel error  xerror      xstd
#> 1  0.73458629      0  1.000000 1.00000 0.0149547
#> 2  0.03082741      1  0.265414 0.26541 0.0077744
#> 3  0.01822141      3  0.203759 0.20811 0.0068891
#> 4  0.00389640      5  0.167316 0.17580 0.0063341
#> 5  0.00297960      6  0.163420 0.17488 0.0063176
#> 6  0.00229200      8  0.157460 0.16892 0.0062095
#> 7  0.00194820     10  0.152876 0.16617 0.0061590
#> 8  0.00183360     12  0.148980 0.16365 0.0061122
#> 9  0.00143250     14  0.145313 0.16204 0.0060823
#> 10 0.00137520     21  0.133624 0.15769 0.0060004
#> 11 0.00126060     24  0.129498 0.15723 0.0059917
#> 12 0.00114600     26  0.126977 0.15494 0.0059480
#> 13 0.00091680     30  0.121705 0.15081 0.0058686
#> 14 0.00080220     38  0.114371 0.14806 0.0058150
#> 15 0.00076400     40  0.112766 0.14806 0.0058150
#> 16 0.00075309     43  0.110474 0.14554 0.0057655
#> 17 0.00068760     52  0.103598 0.14554 0.0057655
#> 18 0.00061120     54  0.102223 0.14577 0.0057700
#> 19 0.00057300     57  0.100390 0.14692 0.0057925
#> 20 0.00045840     62  0.097410 0.14692 0.0057925
#> 21 0.00022920     66  0.095576 0.14829 0.0058195
#> 22 0.00015280     76  0.093055 0.15196 0.0058907
#> 23 0.00010000     79  0.092597 0.15494 0.0059480
plotcp(Rpart.EurekaAll)

Make new rpart model with good cp

Make figures of the decision tree