Install packages
#install.packages('sp')
#install.packages('raster')
#install.packages('rpart')
#install.packages('RColorBrewer')
#install.packages('rattle')
#install.packages('rpart.plot')
#install.packages('caret')
#install.packages('mlr')
#install.packages('DT')
Load packages
library(sp)
library(raster)
library(rpart)
library(RColorBrewer)
library(rattle)
## Loading required package: tibble
## Loading required package: bitops
## Rattle: A free graphical interface for data science with R.
## Version 5.4.0 Copyright (c) 2006-2020 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
library(rpart.plot)
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(mlr)
## Loading required package: ParamHelpers
##
## Attaching package: 'ParamHelpers'
## The following object is masked from 'package:raster':
##
## getValues
## Warning message: 'mlr' is in 'maintenance-only' mode since July 2019.
## Future development will only happen in 'mlr3'
## (<https://mlr3.mlr-org.com>). Due to the focus on 'mlr3' there might be
## uncaught bugs meanwhile in {mlr} - please consider switching.
##
## Attaching package: 'mlr'
## The following object is masked from 'package:caret':
##
## train
## The following object is masked from 'package:raster':
##
## resample
library(DT)
Load final seleciton of variable dataset
load("C:/Users/Linde/Desktop/Scriptie/Schrijven begin/Textuele conceptversie/Final Conceptversion MSc Linde Berbers/Supplementary Information/Supplementary Data 2. RStudio Data/Script Data/EurekaAll_Brick_df.RData")
## 50% of the sample size
smp_size0.5_Eureka <- floor(0.5 * nrow(EurekaAll_Brick_df))
## set the seed to make your partition reproducible
set.seed(123)
train_ind_Eureka <- sample(seq_len(nrow(EurekaAll_Brick_df)), size = smp_size0.5_Eureka)
train0.5_Eureka <- EurekaAll_Brick_df[train_ind_Eureka, ]
test0.5_Eureka <- EurekaAll_Brick_df[-train_ind_Eureka, ]
Make rpart model with the train dataset using ‘HigherThan1km’ as the response variable.
Train0.5.rpart_Eureka <- rpart(HigherThan1km ~ Land + GeoMean + Lithology + Temp + DistTrans + DistFault + Faults + DistPlateBound + Precip + DistTrench + DistVolc + Volc + Trans + Trench + PlateBound, data=train0.5_Eureka, method="class", cp=0.0001)
Pruning the decision tree: Choose best cp by minimal xerror
printcp(Train0.5.rpart_Eureka)
##
## Classification tree:
## rpart(formula = HigherThan1km ~ Land + GeoMean + Lithology +
## Temp + DistTrans + DistFault + Faults + DistPlateBound +
## Precip + DistTrench + DistVolc + Volc + Trans + Trench +
## PlateBound, data = train0.5_Eureka, method = "class", cp = 1e-04)
##
## Variables actually used in tree construction:
## [1] DistFault DistPlateBound DistTrans DistTrench DistVolc
## [6] GeoMean Lithology Precip Temp
##
## Root node error: 2186/90000 = 0.024289
##
## n= 90000
##
## CP nsplit rel error xerror xstd
## 1 0.74290942 0 1.00000 1.00000 0.0211269
## 2 0.02927722 1 0.25709 0.26349 0.0109438
## 3 0.01440988 3 0.19854 0.20311 0.0096154
## 4 0.00411711 5 0.16972 0.17612 0.0089567
## 5 0.00320220 6 0.16560 0.17521 0.0089335
## 6 0.00251601 7 0.16240 0.17292 0.0088753
## 7 0.00228728 11 0.15233 0.17246 0.0088636
## 8 0.00182983 12 0.15005 0.17246 0.0088636
## 9 0.00167734 15 0.14456 0.17338 0.0088870
## 10 0.00160110 18 0.13952 0.17383 0.0088986
## 11 0.00152486 20 0.13632 0.17429 0.0089103
## 12 0.00137237 27 0.12489 0.16972 0.0087931
## 13 0.00091491 33 0.11665 0.16377 0.0086383
## 14 0.00045746 42 0.10750 0.16789 0.0087457
## 15 0.00022873 52 0.10293 0.16972 0.0087931
## 16 0.00010000 58 0.10156 0.17521 0.0089335
plotcp(Train0.5.rpart_Eureka)
Make new rpart model with train dataset + good cp
Train0.5.rpart_Eureka <- rpart(HigherThan1km ~ Land + GeoMean + Lithology + Temp + DistTrans + DistFault + Faults + DistPlateBound + Precip + DistTrench + DistVolc + Volc + Trans + Trench + PlateBound, data=train0.5_Eureka, method="class", cp=0.00182983)
Remove response variable (‘HigherThan1km’) from test dataset for predicting
NewData_test0.5_Eureka <- test0.5_Eureka
NewData_test0.5_Eureka$HigherThan1km <- NULL
datatable(head(NewData_test0.5_Eureka,10),options = list(pageLength=10, pageWidth = 5), autoHideNavigation = TRUE)
Let the train model predict the response variable (‘HigherThan1km’) with the test dataset
Predict_0.5_Class_Eureka <- predict(Train0.5.rpart_Eureka, newdata = NewData_test0.5_Eureka, type = "class")
Predict_0.5_Class_df_Eureka <- as.data.frame(Predict_0.5_Class_Eureka)
Compare the difference between actual and predicted
Test0.5_df_Eureka <- test0.5_Eureka['HigherThan1km']
#Test0.5_df_Eureka
Predict_0.5_Class_df_Eureka <- as.data.frame(Predict_0.5_Class_Eureka)
#Predict_0.5_Class_df_Eureka
table(Test0.5_df_Eureka)
## Test0.5_df_Eureka
## 0 1
## 87823 2177
table(Predict_0.5_Class_df_Eureka)
## Predict_0.5_Class_df_Eureka
## 0 1
## 87894 2106
Calculating confusion matrix components manually
TP <- sum(Test0.5_df_Eureka == 1 & Predict_0.5_Class_df_Eureka == 1)
TP
## [1] 1959
FP <- sum(Test0.5_df_Eureka == 0 & Predict_0.5_Class_df_Eureka == 1)
FP
## [1] 147
FN <- sum(Test0.5_df_Eureka == 1 & Predict_0.5_Class_df_Eureka == 0)
FN
## [1] 218
TN <- sum(Test0.5_df_Eureka == 0 & Predict_0.5_Class_df_Eureka == 0)
TN
## [1] 87676
Calculating Confusion Matrix by funtion
confusionMatrix(factor(Test0.5_df_Eureka == 1), factor(Predict_0.5_Class_df_Eureka == 1), dnn = c("Actual", "Predicted"))
## Confusion Matrix and Statistics
##
## Predicted
## Actual FALSE TRUE
## FALSE 87676 147
## TRUE 218 1959
##
## Accuracy : 0.9959
## 95% CI : (0.9955, 0.9963)
## No Information Rate : 0.9766
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9127
##
## Mcnemar's Test P-Value : 0.0002483
##
## Sensitivity : 0.9975
## Specificity : 0.9302
## Pos Pred Value : 0.9983
## Neg Pred Value : 0.8999
## Prevalence : 0.9766
## Detection Rate : 0.9742
## Detection Prevalence : 0.9758
## Balanced Accuracy : 0.9639
##
## 'Positive' Class : FALSE
##
Matthews Correlation Coefficient (MCC) MCC = ((TP * TN) - (FP * FN)) / sqrt((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN))
((TP*TN)-(FP*FN))
## [1] 171725238
(TP+FP)
## [1] 2106
(TP+FN)
## [1] 2177
(TN+FP)
## [1] 87823
(TN+FN)
## [1] 87894
MCC <- 171725238/sqrt(2106*2177*87823*87894)
MCC
## [1] 0.9128344
Get the rownames of data subsets
rownames_test_0.5_cp_RpartAll <- rownames(Test0.5_df_Eureka)
rownames_test_0.5_cp_numeric_RpartAll <- as.numeric(rownames_test_0.5_cp_RpartAll)
Get coordinates of each cell in dataframe
Coordinates <- coordinates(EurekaAll_Brick)
Coordinates_df <- as.data.frame(Coordinates)
Get the coordinates of the predicted datasubsets
XCor_test_0.5_cp_RpartAll <- Coordinates_df$x[rownames_test_0.5_cp_numeric_RpartAll]
YCor_test_0.5_cp_RpartAll <- Coordinates_df$y[rownames_test_0.5_cp_numeric_RpartAll]
Get the coordinates into one file
XYCor_test_0.5_cp_RpartAll <- data.frame(XCor_test_0.5_cp_RpartAll, YCor_test_0.5_cp_RpartAll)
Get the coordinates and the 0s and 1s into one file.
XYZ_test_0.5_cp_RpartAll <- data.frame(XYCor_test_0.5_cp_RpartAll,Test0.5_df_Eureka)
To raster
RasterPlot_test_0.5_cp_RpartAll <- rasterFromXYZ(XYZ_test_0.5_cp_RpartAll, res=c(0.1,0.1), crs="+proj=longlat +datum=WGS84 +no_defs ")
Plot
plot(RasterPlot_test_0.5_cp_RpartAll, main="Test_0.5_cp")
To GeoTIFF
writeRaster(RasterPlot_test_0.5_cp_RpartAll, "RasterPlot_Test0.5_cp_RpartAll_Better", format ="GTiff", overwrite = TRUE)
## Warning in .gd_SetProject(object, ...): NOT UPDATED FOR PROJ >= 6
##Into raster Predict_0.5_Class_df_Eureka
Get the rownames of data subsets
rownames_pred_0.5_cp_RpartAll <- rownames(Predict_0.5_Class_df_Eureka)
rownames_pred_0.5_cp_numeric_RpartAll <- as.numeric(rownames_pred_0.5_cp_RpartAll)
Get coordinates of each cell in dataframe
Coordinates <- coordinates(EurekaAll_Brick)
Coordinates_df <- as.data.frame(Coordinates)
Get the coordinates of the predicted datasubsets
XCor_pred_0.5_cp_RpartAll <- Coordinates_df$x[rownames_pred_0.5_cp_numeric_RpartAll]
YCor_pred_0.5_cp_RpartAll <- Coordinates_df$y[rownames_pred_0.5_cp_numeric_RpartAll]
Get the coordinates into one file.
XYCor_pred_0.5_cp_RpartAll <- data.frame(XCor_pred_0.5_cp_RpartAll, YCor_pred_0.5_cp_RpartAll)
Get the coordinates and the 0s and 1s into one file.
XYZ_pred_0.5_cp_RpartAll <- data.frame(XYCor_pred_0.5_cp_RpartAll,Predict_0.5_Class_df_Eureka)
To raster
RasterPlot_pred_0.5_cp_RpartAll <- rasterFromXYZ(XYZ_pred_0.5_cp_RpartAll, res=c(0.1,0.1), crs="+proj=longlat +datum=WGS84 +no_defs ")
Plot
#plot(RasterPlot_pred_0.5_cp_RpartAll, main="Predicted_0.5_cp")
To GeoTIFF
writeRaster(RasterPlot_pred_0.5_cp_RpartAll, "RasterPlot_Pred0.5_cp_RpartAll_Better", format ="GTiff", overwrite = TRUE)
## Warning in .gd_SetProject(object, ...): NOT UPDATED FOR PROJ >= 6
plot(RasterPlot_test_0.5_cp_RpartAll, main="Test_0.5_cp")
plot(RasterPlot_pred_0.5_cp_RpartAll, main="Predicted_0.5_cp")
Rpart.EurekaAll <- rpart(HigherThan1km ~ Land + GeoMean + Lithology + Temp + DistTrans + DistFault + Faults + DistPlateBound + Precip + DistTrench + DistVolc + Volc + Trans + Trench + PlateBound, data=EurekaAll_Brick_df, method="class", cp=0.0001)
Choose best cp
printcp(Rpart.EurekaAll)
##
## Classification tree:
## rpart(formula = HigherThan1km ~ Land + GeoMean + Lithology +
## Temp + DistTrans + DistFault + Faults + DistPlateBound +
## Precip + DistTrench + DistVolc + Volc + Trans + Trench +
## PlateBound, data = EurekaAll_Brick_df, method = "class",
## cp = 1e-04)
##
## Variables actually used in tree construction:
## [1] DistFault DistPlateBound DistTrans DistTrench DistVolc
## [6] GeoMean Lithology Precip Temp
##
## Root node error: 4363/180000 = 0.024239
##
## n= 180000
##
## CP nsplit rel error xerror xstd
## 1 0.73458629 0 1.000000 1.00000 0.0149547
## 2 0.03082741 1 0.265414 0.26541 0.0077744
## 3 0.01822141 3 0.203759 0.20811 0.0068891
## 4 0.00389640 5 0.167316 0.17580 0.0063341
## 5 0.00297960 6 0.163420 0.17488 0.0063176
## 6 0.00229200 8 0.157460 0.16892 0.0062095
## 7 0.00194820 10 0.152876 0.16617 0.0061590
## 8 0.00183360 12 0.148980 0.16365 0.0061122
## 9 0.00143250 14 0.145313 0.16204 0.0060823
## 10 0.00137520 21 0.133624 0.15769 0.0060004
## 11 0.00126060 24 0.129498 0.15723 0.0059917
## 12 0.00114600 26 0.126977 0.15494 0.0059480
## 13 0.00091680 30 0.121705 0.15081 0.0058686
## 14 0.00080220 38 0.114371 0.14806 0.0058150
## 15 0.00076400 40 0.112766 0.14806 0.0058150
## 16 0.00075309 43 0.110474 0.14554 0.0057655
## 17 0.00068760 52 0.103598 0.14554 0.0057655
## 18 0.00061120 54 0.102223 0.14577 0.0057700
## 19 0.00057300 57 0.100390 0.14692 0.0057925
## 20 0.00045840 62 0.097410 0.14692 0.0057925
## 21 0.00022920 66 0.095576 0.14829 0.0058195
## 22 0.00015280 76 0.093055 0.15196 0.0058907
## 23 0.00010000 79 0.092597 0.15494 0.0059480
plotcp(Rpart.EurekaAll)
Make new rpart model with good cp
Rpart.EurekaAll.Final <- rpart(HigherThan1km ~ Land + GeoMean + Lithology + Temp + DistTrans + DistFault + Faults + DistPlateBound + Precip + DistTrench + DistVolc + Volc + Trans + Trench + PlateBound, data=EurekaAll_Brick_df, method="class", cp=0.00080220)
rpart.plot(Rpart.EurekaAll.Final,type = 5, extra = "auto",
under = FALSE, fallen.leaves = TRUE,
digits = 2, varlen = 0, faclen = 0, roundint = TRUE,
cex = NULL, tweak = 1.1,
clip.facs = FALSE, clip.right.labs = TRUE,
snip = FALSE,
box.palette = c("BnGn"), shadow.col = 0)
fancyRpartPlot(Rpart.EurekaAll.Final, palettes = c("Reds","Greens"), yesno=1, branch.col="black")
pdf('rpartplot10BrownGreen3')
rpart.plot(Rpart.EurekaAll.Final,type = 5, extra = "auto",
under = FALSE, fallen.leaves = TRUE,
digits = 2, varlen = 0, faclen = 0, roundint = TRUE,
cex = NULL, tweak = 1.1,
clip.facs = FALSE, clip.right.labs = TRUE,
snip = FALSE,
box.palette = c("BnGn"), shadow.col = 0)
dev.off()
## png
## 2
pdf('FancyTryRedsGreenBlack3')
fancyRpartPlot(Rpart.EurekaAll.Final, palettes = c("Reds","Greens"), yesno=1, branch.col="black")
dev.off()
## png
## 2