Install packages
#install.packages('sp')
#install.packages('raster')
#install.packages('rpart')
#install.packages('RColorBrewer')
#install.packages('rattle')
#install.packages('rpart.plot')
#install.packages('caret')
#install.packages('mlr')
#install.packages('DT')Load packages
library(sp)
library(raster)
library(rpart)
library(RColorBrewer)
library(rattle)
#> Loading required package: tibble
#> Loading required package: bitops
#> Rattle: A free graphical interface for data science with R.
#> Version 5.4.0 Copyright (c) 2006-2020 Togaware Pty Ltd.
#> Type 'rattle()' to shake, rattle, and roll your data.
library(rpart.plot)
library(caret)
#> Loading required package: lattice
#> Loading required package: ggplot2
library(mlr)
#> Loading required package: ParamHelpers
#>
#> Attaching package: 'ParamHelpers'
#> The following object is masked from 'package:raster':
#>
#> getValues
#> Warning message: 'mlr' is in 'maintenance-only' mode since July 2019.
#> Future development will only happen in 'mlr3'
#> (<https://mlr3.mlr-org.com>). Due to the focus on 'mlr3' there might be
#> uncaught bugs meanwhile in {mlr} - please consider switching.
#>
#> Attaching package: 'mlr'
#> The following object is masked from 'package:caret':
#>
#> train
#> The following object is masked from 'package:raster':
#>
#> resample
library(DT)Load final seleciton of variable dataset
## 50% of the sample size
smp_size0.5_Eureka <- floor(0.5 * nrow(EurekaAll_Brick_df))
## set the seed to make your partition reproducible
set.seed(123)
train_ind_Eureka <- sample(seq_len(nrow(EurekaAll_Brick_df)), size = smp_size0.5_Eureka)
train0.5_Eureka <- EurekaAll_Brick_df[train_ind_Eureka, ]
test0.5_Eureka <- EurekaAll_Brick_df[-train_ind_Eureka, ]Make rpart model with the train dataset using ‘HigherThan1km’ as the response variable.
Pruning the decision tree: Choose best cp by minimal xerror
printcp(Train0.5.rpart_Eureka)
#>
#> Classification tree:
#> rpart(formula = HigherThan1km ~ Land + GeoMean + Lithology +
#> Temp + DistTrans + DistFault + Faults + DistPlateBound +
#> Precip + DistTrench + DistVolc + Volc + Trans + Trench +
#> PlateBound, data = train0.5_Eureka, method = "class", cp = 1e-04)
#>
#> Variables actually used in tree construction:
#> [1] DistFault DistPlateBound DistTrans DistTrench DistVolc
#> [6] GeoMean Lithology Precip Temp
#>
#> Root node error: 2186/90000 = 0.024289
#>
#> n= 90000
#>
#> CP nsplit rel error xerror xstd
#> 1 0.74290942 0 1.00000 1.00000 0.0211269
#> 2 0.02927722 1 0.25709 0.26349 0.0109438
#> 3 0.01440988 3 0.19854 0.20311 0.0096154
#> 4 0.00411711 5 0.16972 0.17612 0.0089567
#> 5 0.00320220 6 0.16560 0.17521 0.0089335
#> 6 0.00251601 7 0.16240 0.17292 0.0088753
#> 7 0.00228728 11 0.15233 0.17246 0.0088636
#> 8 0.00182983 12 0.15005 0.17246 0.0088636
#> 9 0.00167734 15 0.14456 0.17338 0.0088870
#> 10 0.00160110 18 0.13952 0.17383 0.0088986
#> 11 0.00152486 20 0.13632 0.17429 0.0089103
#> 12 0.00137237 27 0.12489 0.16972 0.0087931
#> 13 0.00091491 33 0.11665 0.16377 0.0086383
#> 14 0.00045746 42 0.10750 0.16789 0.0087457
#> 15 0.00022873 52 0.10293 0.16972 0.0087931
#> 16 0.00010000 58 0.10156 0.17521 0.0089335
plotcp(Train0.5.rpart_Eureka)Make new rpart model with train dataset + good cp
Remove response variable (‘HigherThan1km’) from test dataset for predicting
NewData_test0.5_Eureka <- test0.5_Eureka
NewData_test0.5_Eureka$HigherThan1km <- NULL
datatable(head(NewData_test0.5_Eureka,10),options = list(pageLength=10, pageWidth = 5), autoHideNavigation = TRUE)Let the train model predict the response variable (‘HigherThan1km’) with the test dataset
Compare the difference between actual and predicted
Test0.5_df_Eureka <- test0.5_Eureka['HigherThan1km']
#Test0.5_df_Eureka
Predict_0.5_Class_df_Eureka <- as.data.frame(Predict_0.5_Class_Eureka)
#Predict_0.5_Class_df_Eureka
table(Test0.5_df_Eureka)
#> Test0.5_df_Eureka
#> 0 1
#> 87823 2177
table(Predict_0.5_Class_df_Eureka)
#> Predict_0.5_Class_df_Eureka
#> 0 1
#> 87894 2106Calculating confusion matrix components manually
TP <- sum(Test0.5_df_Eureka == 1 & Predict_0.5_Class_df_Eureka == 1)
TP
#> [1] 1959
FP <- sum(Test0.5_df_Eureka == 0 & Predict_0.5_Class_df_Eureka == 1)
FP
#> [1] 147
FN <- sum(Test0.5_df_Eureka == 1 & Predict_0.5_Class_df_Eureka == 0)
FN
#> [1] 218
TN <- sum(Test0.5_df_Eureka == 0 & Predict_0.5_Class_df_Eureka == 0)
TN
#> [1] 87676Calculating Confusion Matrix by funtion
confusionMatrix(factor(Test0.5_df_Eureka == 1), factor(Predict_0.5_Class_df_Eureka == 1), dnn = c("Actual", "Predicted"))
#> Confusion Matrix and Statistics
#>
#> Predicted
#> Actual FALSE TRUE
#> FALSE 87676 147
#> TRUE 218 1959
#>
#> Accuracy : 0.9959
#> 95% CI : (0.9955, 0.9963)
#> No Information Rate : 0.9766
#> P-Value [Acc > NIR] : < 2.2e-16
#>
#> Kappa : 0.9127
#>
#> Mcnemar's Test P-Value : 0.0002483
#>
#> Sensitivity : 0.9975
#> Specificity : 0.9302
#> Pos Pred Value : 0.9983
#> Neg Pred Value : 0.8999
#> Prevalence : 0.9766
#> Detection Rate : 0.9742
#> Detection Prevalence : 0.9758
#> Balanced Accuracy : 0.9639
#>
#> 'Positive' Class : FALSE
#> Matthews Correlation Coefficient (MCC) MCC = ((TP * TN) - (FP * FN)) / sqrt((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN))
Get the rownames of data subsets
rownames_test_0.5_cp_RpartAll <- rownames(Test0.5_df_Eureka)
rownames_test_0.5_cp_numeric_RpartAll <- as.numeric(rownames_test_0.5_cp_RpartAll)Get coordinates of each cell in dataframe
Get the coordinates of the predicted datasubsets
XCor_test_0.5_cp_RpartAll <- Coordinates_df$x[rownames_test_0.5_cp_numeric_RpartAll]
YCor_test_0.5_cp_RpartAll <- Coordinates_df$y[rownames_test_0.5_cp_numeric_RpartAll]Get the coordinates into one file
Get the coordinates and the 0s and 1s into one file.
To raster
Plot
To GeoTIFF
writeRaster(RasterPlot_test_0.5_cp_RpartAll, "RasterPlot_Test0.5_cp_RpartAll_Better", format ="GTiff", overwrite = TRUE)
#> Warning in .gd_SetProject(object, ...): NOT UPDATED FOR PROJ >= 6##Into raster Predict_0.5_Class_df_Eureka
Get the rownames of data subsets
rownames_pred_0.5_cp_RpartAll <- rownames(Predict_0.5_Class_df_Eureka)
rownames_pred_0.5_cp_numeric_RpartAll <- as.numeric(rownames_pred_0.5_cp_RpartAll)Get coordinates of each cell in dataframe
Get the coordinates of the predicted datasubsets
XCor_pred_0.5_cp_RpartAll <- Coordinates_df$x[rownames_pred_0.5_cp_numeric_RpartAll]
YCor_pred_0.5_cp_RpartAll <- Coordinates_df$y[rownames_pred_0.5_cp_numeric_RpartAll]Get the coordinates into one file.
Get the coordinates and the 0s and 1s into one file.
To raster
Plot
To GeoTIFF
writeRaster(RasterPlot_pred_0.5_cp_RpartAll, "RasterPlot_Pred0.5_cp_RpartAll_Better", format ="GTiff", overwrite = TRUE)
#> Warning in .gd_SetProject(object, ...): NOT UPDATED FOR PROJ >= 6plot(RasterPlot_test_0.5_cp_RpartAll, main="Test_0.5_cp")
plot(RasterPlot_pred_0.5_cp_RpartAll, main="Predicted_0.5_cp")Choose best cp
printcp(Rpart.EurekaAll)
#>
#> Classification tree:
#> rpart(formula = HigherThan1km ~ Land + GeoMean + Lithology +
#> Temp + DistTrans + DistFault + Faults + DistPlateBound +
#> Precip + DistTrench + DistVolc + Volc + Trans + Trench +
#> PlateBound, data = EurekaAll_Brick_df, method = "class",
#> cp = 1e-04)
#>
#> Variables actually used in tree construction:
#> [1] DistFault DistPlateBound DistTrans DistTrench DistVolc
#> [6] GeoMean Lithology Precip Temp
#>
#> Root node error: 4363/180000 = 0.024239
#>
#> n= 180000
#>
#> CP nsplit rel error xerror xstd
#> 1 0.73458629 0 1.000000 1.00000 0.0149547
#> 2 0.03082741 1 0.265414 0.26541 0.0077744
#> 3 0.01822141 3 0.203759 0.20811 0.0068891
#> 4 0.00389640 5 0.167316 0.17580 0.0063341
#> 5 0.00297960 6 0.163420 0.17488 0.0063176
#> 6 0.00229200 8 0.157460 0.16892 0.0062095
#> 7 0.00194820 10 0.152876 0.16617 0.0061590
#> 8 0.00183360 12 0.148980 0.16365 0.0061122
#> 9 0.00143250 14 0.145313 0.16204 0.0060823
#> 10 0.00137520 21 0.133624 0.15769 0.0060004
#> 11 0.00126060 24 0.129498 0.15723 0.0059917
#> 12 0.00114600 26 0.126977 0.15494 0.0059480
#> 13 0.00091680 30 0.121705 0.15081 0.0058686
#> 14 0.00080220 38 0.114371 0.14806 0.0058150
#> 15 0.00076400 40 0.112766 0.14806 0.0058150
#> 16 0.00075309 43 0.110474 0.14554 0.0057655
#> 17 0.00068760 52 0.103598 0.14554 0.0057655
#> 18 0.00061120 54 0.102223 0.14577 0.0057700
#> 19 0.00057300 57 0.100390 0.14692 0.0057925
#> 20 0.00045840 62 0.097410 0.14692 0.0057925
#> 21 0.00022920 66 0.095576 0.14829 0.0058195
#> 22 0.00015280 76 0.093055 0.15196 0.0058907
#> 23 0.00010000 79 0.092597 0.15494 0.0059480
plotcp(Rpart.EurekaAll)Make new rpart model with good cp
rpart.plot(Rpart.EurekaAll.Final,type = 5, extra = "auto",
under = FALSE, fallen.leaves = TRUE,
digits = 2, varlen = 0, faclen = 0, roundint = TRUE,
cex = NULL, tweak = 1.1,
clip.facs = FALSE, clip.right.labs = TRUE,
snip = FALSE,
box.palette = c("BnGn"), shadow.col = 0)
#> Warning: labs do not fit even at cex 0.15, there may be some overplottingfancyRpartPlot(Rpart.EurekaAll.Final, palettes = c("Reds","Greens"), yesno=1, branch.col="black")
#> Warning: labs do not fit even at cex 0.15, there may be some overplottingpdf('rpartplot10BrownGreen3')
rpart.plot(Rpart.EurekaAll.Final,type = 5, extra = "auto",
under = FALSE, fallen.leaves = TRUE,
digits = 2, varlen = 0, faclen = 0, roundint = TRUE,
cex = NULL, tweak = 1.1,
clip.facs = FALSE, clip.right.labs = TRUE,
snip = FALSE,
box.palette = c("BnGn"), shadow.col = 0)
dev.off()
#> png
#> 2