#install.packages('sp') #install.packages('raster') #install.packages('rpart') #install.packages('RColorBrewer') #install.packages('rattle') #install.packages('rpart.plot') #install.packages('caret') #install.packages('mlr') #install.packages('DT')

library(sp) library(raster) library(rpart) library(RColorBrewer) library(rattle) #> Loading required package: tibble #> Loading required package: bitops #> Rattle: A free graphical interface for data science with R. #> Version 5.4.0 Copyright (c) 2006-2020 Togaware Pty Ltd. #> Type 'rattle()' to shake, rattle, and roll your data. library(rpart.plot) library(caret) #> Loading required package: lattice #> Loading required package: ggplot2 library(mlr) #> Loading required package: ParamHelpers #> #> Attaching package: 'ParamHelpers' #> The following object is masked from 'package:raster': #> #> getValues #> Warning message: 'mlr' is in 'maintenance-only' mode since July 2019. #> Future development will only happen in 'mlr3' #> (<https://mlr3.mlr-org.com>). Due to the focus on 'mlr3' there might be #> uncaught bugs meanwhile in {mlr} - please consider switching. #> #> Attaching package: 'mlr' #> The following object is masked from 'package:caret': #> #> train #> The following object is masked from 'package:raster': #> #> resample library(DT)

load("C:/Users/Linde/Desktop/Scriptie/Schrijven begin/Textuele conceptversie/Final Conceptversion MSc Linde Berbers/Supplementary Information/Supplementary Data 2. RStudio Data/Script Data/EurekaAll_Brick_df.RData")

## 50% of the sample size smp_size0.5_Eureka <- floor(0.5 * nrow(EurekaAll_Brick_df)) ## set the seed to make your partition reproducible set.seed(123) train_ind_Eureka <- sample(seq_len(nrow(EurekaAll_Brick_df)), size = smp_size0.5_Eureka) train0.5_Eureka <- EurekaAll_Brick_df[train_ind_Eureka, ] test0.5_Eureka <- EurekaAll_Brick_df[-train_ind_Eureka, ]

Train0.5.rpart_Eureka <- rpart(HigherThan1km ~ Land + GeoMean + Lithology + Temp + DistTrans + DistFault + Faults + DistPlateBound + Precip + DistTrench + DistVolc + Volc + Trans + Trench + PlateBound, data=train0.5_Eureka, method="class", cp=0.0001)

printcp(Train0.5.rpart_Eureka) #> #> Classification tree: #> rpart(formula = HigherThan1km ~ Land + GeoMean + Lithology + #> Temp + DistTrans + DistFault + Faults + DistPlateBound + #> Precip + DistTrench + DistVolc + Volc + Trans + Trench + #> PlateBound, data = train0.5_Eureka, method = "class", cp = 1e-04) #> #> Variables actually used in tree construction: #> [1] DistFault DistPlateBound DistTrans DistTrench DistVolc #> [6] GeoMean Lithology Precip Temp #> #> Root node error: 2186/90000 = 0.024289 #> #> n= 90000 #> #> CP nsplit rel error xerror xstd #> 1 0.74290942 0 1.00000 1.00000 0.0211269 #> 2 0.02927722 1 0.25709 0.26349 0.0109438 #> 3 0.01440988 3 0.19854 0.20311 0.0096154 #> 4 0.00411711 5 0.16972 0.17612 0.0089567 #> 5 0.00320220 6 0.16560 0.17521 0.0089335 #> 6 0.00251601 7 0.16240 0.17292 0.0088753 #> 7 0.00228728 11 0.15233 0.17246 0.0088636 #> 8 0.00182983 12 0.15005 0.17246 0.0088636 #> 9 0.00167734 15 0.14456 0.17338 0.0088870 #> 10 0.00160110 18 0.13952 0.17383 0.0088986 #> 11 0.00152486 20 0.13632 0.17429 0.0089103 #> 12 0.00137237 27 0.12489 0.16972 0.0087931 #> 13 0.00091491 33 0.11665 0.16377 0.0086383 #> 14 0.00045746 42 0.10750 0.16789 0.0087457 #> 15 0.00022873 52 0.10293 0.16972 0.0087931 #> 16 0.00010000 58 0.10156 0.17521 0.0089335 plotcp(Train0.5.rpart_Eureka)

NewData_test0.5_Eureka <- test0.5_Eureka NewData_test0.5_Eureka$HigherThan1km <- NULL datatable(head(NewData_test0.5_Eureka,10),options = list(pageLength=10, pageWidth = 5), autoHideNavigation = TRUE)

Test0.5_df_Eureka <- test0.5_Eureka['HigherThan1km'] #Test0.5_df_Eureka Predict_0.5_Class_df_Eureka <- as.data.frame(Predict_0.5_Class_Eureka) #Predict_0.5_Class_df_Eureka table(Test0.5_df_Eureka) #> Test0.5_df_Eureka #> 0 1 #> 87823 2177 table(Predict_0.5_Class_df_Eureka) #> Predict_0.5_Class_df_Eureka #> 0 1 #> 87894 2106

TP <- sum(Test0.5_df_Eureka == 1 & Predict_0.5_Class_df_Eureka == 1) TP #> [1] 1959 FP <- sum(Test0.5_df_Eureka == 0 & Predict_0.5_Class_df_Eureka == 1) FP #> [1] 147 FN <- sum(Test0.5_df_Eureka == 1 & Predict_0.5_Class_df_Eureka == 0) FN #> [1] 218 TN <- sum(Test0.5_df_Eureka == 0 & Predict_0.5_Class_df_Eureka == 0) TN #> [1] 87676

confusionMatrix(factor(Test0.5_df_Eureka == 1), factor(Predict_0.5_Class_df_Eureka == 1), dnn = c("Actual", "Predicted")) #> Confusion Matrix and Statistics #> #> Predicted #> Actual FALSE TRUE #> FALSE 87676 147 #> TRUE 218 1959 #> #> Accuracy : 0.9959 #> 95% CI : (0.9955, 0.9963) #> No Information Rate : 0.9766 #> P-Value [Acc > NIR] : < 2.2e-16 #> #> Kappa : 0.9127 #> #> Mcnemar's Test P-Value : 0.0002483 #> #> Sensitivity : 0.9975 #> Specificity : 0.9302 #> Pos Pred Value : 0.9983 #> Neg Pred Value : 0.8999 #> Prevalence : 0.9766 #> Detection Rate : 0.9742 #> Detection Prevalence : 0.9758 #> Balanced Accuracy : 0.9639 #> #> 'Positive' Class : FALSE #>

Into raster Test0.5_df_Eureka

Get the rownames of data subsets

rownames_test_0.5_cp_RpartAll <- rownames(Test0.5_df_Eureka)
rownames_test_0.5_cp_numeric_RpartAll <- as.numeric(rownames_test_0.5_cp_RpartAll)

Get coordinates of each cell in dataframe

Coordinates <- coordinates(EurekaAll_Brick)
Coordinates_df <- as.data.frame(Coordinates)

Get the coordinates of the predicted datasubsets

XCor_test_0.5_cp_RpartAll <- Coordinates_df$x[rownames_test_0.5_cp_numeric_RpartAll]
YCor_test_0.5_cp_RpartAll <- Coordinates_df$y[rownames_test_0.5_cp_numeric_RpartAll]

Get the coordinates into one file

XYCor_test_0.5_cp_RpartAll <- data.frame(XCor_test_0.5_cp_RpartAll, YCor_test_0.5_cp_RpartAll)

Get the coordinates and the 0s and 1s into one file.

XYZ_test_0.5_cp_RpartAll <- data.frame(XYCor_test_0.5_cp_RpartAll,Test0.5_df_Eureka)

To raster

RasterPlot_test_0.5_cp_RpartAll <- rasterFromXYZ(XYZ_test_0.5_cp_RpartAll, res=c(0.1,0.1), crs="+proj=longlat +datum=WGS84 +no_defs ")

Plot

plot(RasterPlot_test_0.5_cp_RpartAll, main="Test_0.5_cp")

To GeoTIFF

writeRaster(RasterPlot_test_0.5_cp_RpartAll, "RasterPlot_Test0.5_cp_RpartAll_Better", format ="GTiff", overwrite = TRUE)
#> Warning in .gd_SetProject(object, ...): NOT UPDATED FOR PROJ >= 6

##Into raster Predict_0.5_Class_df_Eureka

Get the rownames of data subsets

rownames_pred_0.5_cp_RpartAll <- rownames(Predict_0.5_Class_df_Eureka)
rownames_pred_0.5_cp_numeric_RpartAll <- as.numeric(rownames_pred_0.5_cp_RpartAll)

Get coordinates of each cell in dataframe

Coordinates <- coordinates(EurekaAll_Brick)
Coordinates_df <- as.data.frame(Coordinates)

Get the coordinates of the predicted datasubsets

XCor_pred_0.5_cp_RpartAll <- Coordinates_df$x[rownames_pred_0.5_cp_numeric_RpartAll]
YCor_pred_0.5_cp_RpartAll <- Coordinates_df$y[rownames_pred_0.5_cp_numeric_RpartAll]

Get the coordinates into one file.

XYCor_pred_0.5_cp_RpartAll <- data.frame(XCor_pred_0.5_cp_RpartAll, YCor_pred_0.5_cp_RpartAll)

Get the coordinates and the 0s and 1s into one file.

XYZ_pred_0.5_cp_RpartAll <- data.frame(XYCor_pred_0.5_cp_RpartAll,Predict_0.5_Class_df_Eureka)

To raster

RasterPlot_pred_0.5_cp_RpartAll <- rasterFromXYZ(XYZ_pred_0.5_cp_RpartAll, res=c(0.1,0.1), crs="+proj=longlat +datum=WGS84 +no_defs ")

Plot

#plot(RasterPlot_pred_0.5_cp_RpartAll, main="Predicted_0.5_cp")

To GeoTIFF

writeRaster(RasterPlot_pred_0.5_cp_RpartAll, "RasterPlot_Pred0.5_cp_RpartAll_Better", format ="GTiff", overwrite = TRUE)
#> Warning in .gd_SetProject(object, ...): NOT UPDATED FOR PROJ >= 6

Compare difference visually

plot(RasterPlot_test_0.5_cp_RpartAll, main="Test_0.5_cp")
plot(RasterPlot_pred_0.5_cp_RpartAll, main="Predicted_0.5_cp")

Make rpart model with the 100% dataset using ‘HigherThan1km’ as the response variable.

Rpart.EurekaAll <- rpart(HigherThan1km ~ Land + GeoMean + Lithology + Temp + DistTrans + DistFault + Faults + DistPlateBound + Precip + DistTrench + DistVolc + Volc + Trans + Trench + PlateBound, data=EurekaAll_Brick_df, method="class", cp=0.0001)

Choose best cp

printcp(Rpart.EurekaAll)
#> 
#> Classification tree:
#> rpart(formula = HigherThan1km ~ Land + GeoMean + Lithology + 
#>     Temp + DistTrans + DistFault + Faults + DistPlateBound + 
#>     Precip + DistTrench + DistVolc + Volc + Trans + Trench + 
#>     PlateBound, data = EurekaAll_Brick_df, method = "class", 
#>     cp = 1e-04)
#> 
#> Variables actually used in tree construction:
#> [1] DistFault      DistPlateBound DistTrans      DistTrench     DistVolc      
#> [6] GeoMean        Lithology      Precip         Temp          
#> 
#> Root node error: 4363/180000 = 0.024239
#> 
#> n= 180000 
#> 
#>            CP nsplit rel error  xerror      xstd
#> 1  0.73458629      0  1.000000 1.00000 0.0149547
#> 2  0.03082741      1  0.265414 0.26541 0.0077744
#> 3  0.01822141      3  0.203759 0.20811 0.0068891
#> 4  0.00389640      5  0.167316 0.17580 0.0063341
#> 5  0.00297960      6  0.163420 0.17488 0.0063176
#> 6  0.00229200      8  0.157460 0.16892 0.0062095
#> 7  0.00194820     10  0.152876 0.16617 0.0061590
#> 8  0.00183360     12  0.148980 0.16365 0.0061122
#> 9  0.00143250     14  0.145313 0.16204 0.0060823
#> 10 0.00137520     21  0.133624 0.15769 0.0060004
#> 11 0.00126060     24  0.129498 0.15723 0.0059917
#> 12 0.00114600     26  0.126977 0.15494 0.0059480
#> 13 0.00091680     30  0.121705 0.15081 0.0058686
#> 14 0.00080220     38  0.114371 0.14806 0.0058150
#> 15 0.00076400     40  0.112766 0.14806 0.0058150
#> 16 0.00075309     43  0.110474 0.14554 0.0057655
#> 17 0.00068760     52  0.103598 0.14554 0.0057655
#> 18 0.00061120     54  0.102223 0.14577 0.0057700
#> 19 0.00057300     57  0.100390 0.14692 0.0057925
#> 20 0.00045840     62  0.097410 0.14692 0.0057925
#> 21 0.00022920     66  0.095576 0.14829 0.0058195
#> 22 0.00015280     76  0.093055 0.15196 0.0058907
#> 23 0.00010000     79  0.092597 0.15494 0.0059480
plotcp(Rpart.EurekaAll)

Make new rpart model with good cp

Rpart.EurekaAll.Final <- rpart(HigherThan1km ~ Land + GeoMean + Lithology + Temp + DistTrans + DistFault + Faults + DistPlateBound + Precip + DistTrench + DistVolc + Volc + Trans + Trench + PlateBound, data=EurekaAll_Brick_df, method="class", cp=0.00080220)

Make figures of the decision tree

rpart.plot(Rpart.EurekaAll.Final,type = 5, extra = "auto",
    under = FALSE, fallen.leaves = TRUE,
    digits = 2, varlen = 0, faclen = 0, roundint = TRUE,
    cex = NULL, tweak = 1.1,
    clip.facs = FALSE, clip.right.labs = TRUE,
    snip = FALSE,
    box.palette = c("BnGn"), shadow.col = 0)
#> Warning: labs do not fit even at cex 0.15, there may be some overplotting

fancyRpartPlot(Rpart.EurekaAll.Final, palettes = c("Reds","Greens"), yesno=1, branch.col="black")
#> Warning: labs do not fit even at cex 0.15, there may be some overplotting

pdf('rpartplot10BrownGreen3')
rpart.plot(Rpart.EurekaAll.Final,type = 5, extra = "auto",
    under = FALSE, fallen.leaves = TRUE,
    digits = 2, varlen = 0, faclen = 0, roundint = TRUE,
    cex = NULL, tweak = 1.1,
    clip.facs = FALSE, clip.right.labs = TRUE,
    snip = FALSE,
    box.palette = c("BnGn"), shadow.col = 0)
dev.off()
#> png 
#>   2

pdf('FancyTryRedsGreenBlack3')
fancyRpartPlot(Rpart.EurekaAll.Final, palettes = c("Reds","Greens"), yesno=1, branch.col="black")
dev.off()
#> png 
#>   2

4 Developing the decision tree

Linde Berbers

2021-06-30

Split the present-day dataframe in half to create a train and test dataset.

ACCURACY TESTING

Confusion Matrix

Make rasters of actual and predicted to plot

Into raster Test0.5_df_Eureka

Compare difference visually

Make rpart model with the 100% dataset using ‘HigherThan1km’ as the response variable.

Make figures of the decision tree