#load libraries
library(plyr)
## Warning: package 'plyr' was built under R version 4.2.3
library(Amelia)
## Warning: package 'Amelia' was built under R version 4.2.3
## Loading required package: Rcpp
## Warning: package 'Rcpp' was built under R version 4.2.3
## ##
## ## Amelia II: Multiple Imputation
## ## (Version 1.8.1, built: 2022-11-18)
## ## Copyright (C) 2005-2024 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.2.3
## corrplot 0.92 loaded
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.2.3
library(MASS)
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.2.3
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
library(party)
## Warning: package 'party' was built under R version 4.2.3
## Loading required package: grid
## Loading required package: mvtnorm
## Warning: package 'mvtnorm' was built under R version 4.2.3
## Loading required package: modeltools
## Loading required package: stats4
##
## Attaching package: 'modeltools'
## The following object is masked from 'package:plyr':
##
## empty
## Loading required package: strucchange
## Warning: package 'strucchange' was built under R version 4.2.3
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 4.2.3
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Loading required package: sandwich
## Warning: package 'sandwich' was built under R version 4.2.3
library(caret)
## Warning: package 'caret' was built under R version 4.2.3
## Loading required package: lattice
library(GGally)
## Warning: package 'GGally' was built under R version 4.2.3
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(corrplot)
library(caTools)
## Warning: package 'caTools' was built under R version 4.2.3
library(car)
## Warning: package 'car' was built under R version 4.2.3
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.2.3
##
## Attaching package: 'car'
## The following object is masked from 'package:modeltools':
##
## Predict
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.2.3
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
##
## recode
## The following object is masked from 'package:party':
##
## where
## The following object is masked from 'package:randomForest':
##
## combine
## The following object is masked from 'package:MASS':
##
## select
## The following objects are masked from 'package:plyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(pROC)
## Warning: package 'pROC' was built under R version 4.2.3
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.2.3
## Warning: package 'tibble' was built under R version 4.2.3
## Warning: package 'purrr' was built under R version 4.2.3
## Warning: package 'lubridate' was built under R version 4.2.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.0
## ✔ readr 2.1.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::arrange() masks plyr::arrange()
## ✖ stringr::boundary() masks strucchange::boundary()
## ✖ dplyr::combine() masks randomForest::combine()
## ✖ purrr::compact() masks plyr::compact()
## ✖ dplyr::count() masks plyr::count()
## ✖ dplyr::desc() masks plyr::desc()
## ✖ dplyr::failwith() masks plyr::failwith()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::id() masks plyr::id()
## ✖ dplyr::lag() masks stats::lag()
## ✖ purrr::lift() masks caret::lift()
## ✖ randomForest::margin() masks ggplot2::margin()
## ✖ dplyr::mutate() masks plyr::mutate()
## ✖ dplyr::recode() masks car::recode()
## ✖ dplyr::rename() masks plyr::rename()
## ✖ dplyr::select() masks MASS::select()
## ✖ purrr::some() masks car::some()
## ✖ dplyr::summarise() masks plyr::summarise()
## ✖ dplyr::summarize() masks plyr::summarize()
## ✖ dplyr::where() masks party::where()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(mboost)
## Warning: package 'mboost' was built under R version 4.2.3
## Loading required package: parallel
## Loading required package: stabs
## Warning: package 'stabs' was built under R version 4.2.3
##
## Attaching package: 'stabs'
##
## The following object is masked from 'package:modeltools':
##
## parameters
##
##
## Attaching package: 'mboost'
##
## The following object is masked from 'package:tidyr':
##
## extract
##
## The following object is masked from 'package:party':
##
## varimp
##
## The following object is masked from 'package:ggplot2':
##
## %+%
library(ggalluvial)
## Warning: package 'ggalluvial' was built under R version 4.2.3
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 4.2.3
##
## Attaching package: 'gridExtra'
##
## The following object is masked from 'package:dplyr':
##
## combine
##
## The following object is masked from 'package:randomForest':
##
## combine
library(janitor)
##
## Attaching package: 'janitor'
##
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
land <- read.csv("landslide-prediction.csv")
land$X_id <- NULL
land <- clean_names(land)
dim(land)
## [1] 1212 13
glimpse(land)
## Rows: 1,212
## Columns: 13
## $ landslide <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ aspect <int> 3, 1, 3, 1, 5, 5, 1, 3, 3, 1, 3, 5, 3, 4, 5, 4, 5, 5, 5,…
## $ curvature <int> 3, 5, 4, 3, 4, 5, 3, 5, 2, 4, 3, 5, 2, 3, 2, 3, 4, 3, 1,…
## $ earthquake <int> 2, 2, 3, 3, 2, 2, 2, 3, 3, 3, 3, 1, 3, 2, 1, 3, 2, 2, 2,…
## $ elevation <int> 2, 3, 2, 3, 1, 2, 2, 4, 3, 2, 2, 2, 3, 1, 1, 3, 2, 1, 3,…
## $ flow <int> 2, 1, 2, 5, 4, 3, 4, 2, 2, 4, 2, 3, 2, 2, 3, 3, 3, 3, 3,…
## $ lithology <int> 1, 1, 4, 1, 1, 1, 1, 2, 6, 1, 1, 1, 3, 1, 3, 1, 3, 1, 1,…
## $ ndvi <int> 4, 4, 3, 2, 2, 2, 3, 3, 4, 2, 2, 2, 4, 4, 1, 2, 3, 2, 1,…
## $ ndwi <int> 2, 2, 2, 4, 4, 4, 4, 3, 2, 4, 3, 4, 2, 2, 5, 4, 3, 4, 5,…
## $ plan <int> 2, 5, 4, 3, 3, 5, 3, 5, 2, 4, 3, 5, 3, 4, 3, 3, 4, 3, 1,…
## $ precipitation <int> 3, 5, 5, 5, 3, 3, 5, 5, 5, 5, 5, 2, 4, 4, 2, 5, 2, 4, 3,…
## $ profile <int> 3, 2, 2, 3, 1, 2, 3, 2, 4, 2, 3, 2, 4, 4, 4, 3, 2, 3, 5,…
## $ slope <int> 2, 2, 2, 3, 4, 2, 2, 4, 4, 4, 1, 4, 1, 2, 1, 2, 1, 3, 4,…
library(mice)
## Warning: package 'mice' was built under R version 4.2.3
##
## Attaching package: 'mice'
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following objects are masked from 'package:base':
##
## cbind, rbind
md.pattern(land, rotate.names = T)
## /\ /\
## { `---' }
## { O O }
## ==> V <== No need for mice. This data set is completely observed.
## \ \|/ /
## `-----'
## landslide aspect curvature earthquake elevation flow lithology ndvi ndwi
## 1212 1 1 1 1 1 1 1 1 1
## 0 0 0 0 0 0 0 0 0
## plan precipitation profile slope
## 1212 1 1 1 1 0
## 0 0 0 0 0
missmap(land)
#check for duplicate
sum(duplicated(land))
## [1] 14
str(land)
## 'data.frame': 1212 obs. of 13 variables:
## $ landslide : int 0 0 0 0 0 0 0 0 0 0 ...
## $ aspect : int 3 1 3 1 5 5 1 3 3 1 ...
## $ curvature : int 3 5 4 3 4 5 3 5 2 4 ...
## $ earthquake : int 2 2 3 3 2 2 2 3 3 3 ...
## $ elevation : int 2 3 2 3 1 2 2 4 3 2 ...
## $ flow : int 2 1 2 5 4 3 4 2 2 4 ...
## $ lithology : int 1 1 4 1 1 1 1 2 6 1 ...
## $ ndvi : int 4 4 3 2 2 2 3 3 4 2 ...
## $ ndwi : int 2 2 2 4 4 4 4 3 2 4 ...
## $ plan : int 2 5 4 3 3 5 3 5 2 4 ...
## $ precipitation: int 3 5 5 5 3 3 5 5 5 5 ...
## $ profile : int 3 2 2 3 1 2 3 2 4 2 ...
## $ slope : int 2 2 2 3 4 2 2 4 4 4 ...
land$landslide <- as.factor(land$landslide)
land$aspect <- as.factor(land$aspect)
land$curvature <- as.factor(land$curvature)
land$earthquake <- as.factor(land$earthquake)
land$elevation <- as.factor(land$elevation)
land$flow <- as.factor(land$flow)
land$lithology <- as.factor(land$lithology)
land$ndvi <- as.factor(land$ndvi)
land$ndwi <- as.factor(land$ndwi)
land$plan <- as.factor(land$plan)
land$precipitation <- as.factor(land$precipitation)
land$profile <- as.factor(land$profile)
land$slope <- as.factor(land$slope)
#Exploratory data analysis
landslide_count <- ggplot(land, aes(x = landslide)) +
geom_bar(color = "red", width = 0.9, fill = "blue") +
coord_flip() + theme_classic() +
theme(text = element_text(size = 20, family = "sans")) +
theme(legend.position = "none") +
scale_y_continuous(name = " ", limits = c(0,700)) +
scale_x_discrete(limits = c("1", "0")) +
ggtitle("Landslide count") +
theme(text = element_text(size = 20, face = "bold"))
landslide_count
land_aspect <- ggplot(land, aes(x = aspect, fill = landslide)) +
scale_fill_manual(values = c("1" = "#BB2936","0"="#2A7BBA")) +
geom_bar(color = "black", width = 0.9) +
theme_classic() +
theme(text = element_text(size = 20, family = "sans")) +
#theme(legend.position = "none") +
scale_y_continuous(name = " ", limits = c(0,700)) +
scale_x_discrete(name = " ") +
ggtitle("Landslide and Aspect") +
theme(text = element_text(size = 20, face = "bold"))
land_aspect
land_curvature <- ggplot(land, aes(x = curvature, fill = landslide)) +
scale_fill_manual(values = c("1" = "#BB2936","0"="#2A7BBA")) +
geom_bar(values = c("1" = "#BB2936","0"="#2A7BBA")) +
theme_classic() +
theme(text = element_text(size = 20, family = "sans")) +
#theme(legend.position = "none") +
scale_y_continuous(name = " ", limits = c(0,700)) +
scale_x_discrete(name = " ") +
ggtitle("Landslide and Curvature") +
theme(text = element_text(size = 20, face = "bold"))
## Warning in geom_bar(values = c(`1` = "#BB2936", `0` = "#2A7BBA")): Ignoring
## unknown parameters: `values`
land_curvature
land_earthquake <- ggplot(land, aes(x = earthquake, fill = landslide)) +
scale_fill_manual(values = c("1" = "#BB2936","0"="#2A7BBA")) +
geom_bar(values = c("1" = "#BB2936","0"="#2A7BBA")) +
theme_classic() +
theme(text = element_text(size = 20, family = "sans")) +
#theme(legend.position = "none") +
scale_y_continuous(name = " ", limits = c(0,700)) +
scale_x_discrete(name = " ") +
ggtitle("Landslide and Earthquake") +
theme(text = element_text(size = 20, face = "bold"))
## Warning in geom_bar(values = c(`1` = "#BB2936", `0` = "#2A7BBA")): Ignoring
## unknown parameters: `values`
land_earthquake
land_elevation <- ggplot(land, aes(x = elevation, fill = landslide)) +
scale_fill_manual(values = c("1" = "#BB2936","0"="#2A7BBA")) +
geom_bar(values = c("1" = "#BB2936","0"="#2A7BBA")) +
theme_classic() +
theme(text = element_text(size = 20, family = "sans")) +
#theme(legend.position = "none") +
scale_y_continuous(name = " ", limits = c(0,700)) +
scale_x_discrete(name = " ") +
ggtitle("Landslide and Elevation") +
theme(text = element_text(size = 20, face = "bold"))
## Warning in geom_bar(values = c(`1` = "#BB2936", `0` = "#2A7BBA")): Ignoring
## unknown parameters: `values`
land_elevation
land_flow<- ggplot(land, aes(x = flow, fill = landslide)) +
scale_fill_manual(values = c("1" = "#BB2936","0"="#2A7BBA")) +
geom_bar(values = c("1" = "#BB2936","0"="#2A7BBA")) +
theme_classic() +
theme(text = element_text(size = 20, family = "sans")) +
#theme(legend.position = "none") +
scale_y_continuous(name = " ", limits = c(0,700)) +
scale_x_discrete(name = " ") +
ggtitle("Landslide and Flow") +
theme(text = element_text(size = 20, face = "bold"))
## Warning in geom_bar(values = c(`1` = "#BB2936", `0` = "#2A7BBA")): Ignoring
## unknown parameters: `values`
land_flow
land_plan <- ggplot(land, aes(x = plan, fill = landslide)) +
scale_fill_manual(values = c("1" = "#BB2936","0"="#2A7BBA")) +
geom_bar(values = c("1" = "#BB2936","0"="#2A7BBA")) +
theme_classic() +
theme(text = element_text(size = 20, family = "sans")) +
#theme(legend.position = "none") +
scale_y_continuous(name = " ", limits = c(0,700)) +
scale_x_discrete(name = " ") +
ggtitle("Landslide and Plan") +
theme(text = element_text(size = 20, face = "bold"))
## Warning in geom_bar(values = c(`1` = "#BB2936", `0` = "#2A7BBA")): Ignoring
## unknown parameters: `values`
land_plan
land_precipitation <- ggplot(land, aes(x = precipitation, fill = landslide)) +
scale_fill_manual(values = c("1" = "#BB2936","0"="#2A7BBA")) +
geom_bar(values = c("1" = "#BB2936","0"="#2A7BBA")) +
theme_classic() +
theme(text = element_text(size = 20, family = "sans")) +
#theme(legend.position = "none") +
scale_y_continuous(name = " ", limits = c(0,700)) +
scale_x_discrete(name = " ") +
ggtitle("Landslide and Precipitation") +
theme(text = element_text(size = 20, face = "bold"))
## Warning in geom_bar(values = c(`1` = "#BB2936", `0` = "#2A7BBA")): Ignoring
## unknown parameters: `values`
land_precipitation
land_profile <- ggplot(land, aes(x = profile, fill = landslide)) +
scale_fill_manual(values = c("1" = "#BB2936","0"="#2A7BBA")) +
geom_bar(values = c("1" = "#BB2936","0"="#2A7BBA")) +
theme_classic() +
theme(text = element_text(size = 20, family = "sans")) +
#theme(legend.position = "none") +
scale_y_continuous(name = " ", limits = c(0,700)) +
scale_x_discrete(name = " ") +
ggtitle("Landslide and Profile") +
theme(text = element_text(size = 20, face = "bold"))
## Warning in geom_bar(values = c(`1` = "#BB2936", `0` = "#2A7BBA")): Ignoring
## unknown parameters: `values`
land_profile
land_slope <- ggplot(land, aes(x = slope, fill = landslide)) +
scale_fill_manual(values = c("1" = "#BB2936","0"="#2A7BBA")) +
geom_bar(values = c("1" = "#BB2936","0"="#2A7BBA")) +
theme_classic() +
theme(text = element_text(size = 20, family = "sans")) +
#theme(legend.position = "none") +
scale_y_continuous(name = " ", limits = c(0,700)) +
scale_x_discrete(name = " ") +
ggtitle("Landslide and Elevation") +
theme(text = element_text(size = 20, face = "bold"))
## Warning in geom_bar(values = c(`1` = "#BB2936", `0` = "#2A7BBA")): Ignoring
## unknown parameters: `values`
grid.arrange(
landslide_count, land_aspect,land_curvature,land_earthquake,land_elevation, land_flow, land_plan, land_precipitation, land_profile, land_slope, ncol = 5, nrow = 2)
#Logistic Regression split to train and test
set.seed(123)
split <- sample.split(land, SplitRatio = 0.8)
train <- subset(land, split == "TRUE")
test <- subset(land, split == "FALSE")
str(train)
## 'data.frame': 933 obs. of 13 variables:
## $ landslide : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ aspect : Factor w/ 5 levels "1","2","3","4",..: 3 1 3 1 5 1 3 1 5 3 ...
## $ curvature : Factor w/ 5 levels "1","2","3","4",..: 3 5 4 3 5 3 2 4 5 2 ...
## $ earthquake : Factor w/ 3 levels "1","2","3": 2 2 3 3 2 2 3 3 1 3 ...
## $ elevation : Factor w/ 5 levels "1","2","3","4",..: 2 3 2 3 2 2 3 2 2 3 ...
## $ flow : Factor w/ 5 levels "1","2","3","4",..: 2 1 2 5 3 4 2 4 3 2 ...
## $ lithology : Factor w/ 6 levels "1","2","3","4",..: 1 1 4 1 1 1 6 1 1 3 ...
## $ ndvi : Factor w/ 5 levels "1","2","3","4",..: 4 4 3 2 2 3 4 2 2 4 ...
## $ ndwi : Factor w/ 5 levels "1","2","3","4",..: 2 2 2 4 4 4 2 4 4 2 ...
## $ plan : Factor w/ 5 levels "1","2","3","4",..: 2 5 4 3 5 3 2 4 5 3 ...
## $ precipitation: Factor w/ 5 levels "1","2","3","4",..: 3 5 5 5 3 5 5 5 2 4 ...
## $ profile : Factor w/ 5 levels "1","2","3","4",..: 3 2 2 3 2 3 4 2 2 4 ...
## $ slope : Factor w/ 5 levels "1","2","3","4",..: 2 2 2 3 2 2 4 4 4 1 ...
str(test)
## 'data.frame': 279 obs. of 13 variables:
## $ landslide : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ aspect : Factor w/ 5 levels "1","2","3","4",..: 5 3 3 5 1 1 1 4 1 1 ...
## $ curvature : Factor w/ 5 levels "1","2","3","4",..: 4 5 3 3 2 4 4 2 2 5 ...
## $ earthquake : Factor w/ 3 levels "1","2","3": 2 3 3 2 1 1 3 2 3 3 ...
## $ elevation : Factor w/ 5 levels "1","2","3","4",..: 1 4 2 1 5 1 4 3 1 4 ...
## $ flow : Factor w/ 5 levels "1","2","3","4",..: 4 2 2 3 5 5 4 3 4 5 ...
## $ lithology : Factor w/ 6 levels "1","2","3","4",..: 1 2 1 1 3 1 3 4 1 6 ...
## $ ndvi : Factor w/ 5 levels "1","2","3","4",..: 2 3 2 2 1 4 1 4 1 2 ...
## $ ndwi : Factor w/ 5 levels "1","2","3","4",..: 4 3 3 4 4 2 5 2 5 4 ...
## $ plan : Factor w/ 5 levels "1","2","3","4",..: 3 5 3 3 2 4 3 2 3 5 ...
## $ precipitation: Factor w/ 5 levels "1","2","3","4",..: 3 5 5 4 1 1 5 3 5 5 ...
## $ profile : Factor w/ 5 levels "1","2","3","4",..: 1 2 3 3 4 3 2 4 5 2 ...
## $ slope : Factor w/ 5 levels "1","2","3","4",..: 4 4 1 3 4 2 5 4 3 3 ...
#Check for Multicolinearity on Numeric Variables Visualizing using correlation Plot
# land2 <- read.csv("landslide-prediction.csv")
#
# numeric_var <- which(sapply(land2, is.numeric))
# numeric_var_names <- names(numeric_var)
# cat("There are", length(numeric_var), "numeric variables")
#
# land_numvar <- land2[, numeric_var]
# cor_numvar <- cor(land_numvar, use = "pairwise.complete.obs")
# library(dplyr)
#
# cor_sorted <- as.matrix(sort(cor_numvar[, 'earthquake'], decreasing = TRUE))
#
# #select only high correlation
# corHigh <- names(which(apply(cor_sorted, 1, function(x) abs(x) > 0.5)))
# cor_numVar <- cor_numvar[corHigh, corHigh]
#
# corrplot.mixed(cor_numVar, tl.col = "black", tl.pos= "lt")
# numerics <- unlist(lapply(train, is.numeric))
# numerics <- train[,numerics]
#
# cor.mtest <- function(mat, ...) {
# mat <- as.matrix(mat)
# n <- ncol(mat)
# p.mat<- matrix(NA, n, n)
# diag(p.mat) <- 0
# for (i in 1:(n - 1)) {
# for (j in (i + 1):n) {
# tmp <- cor.test(mat[, i], mat[, j], ...)
# p.mat[i, j] <- p.mat[j, i] <- tmp$p.value
# }
# }
# colnames(p.mat) <- rownames(p.mat) <- colnames(mat)
# p.mat
# }
# M <- cor(numerics)
# p.mat <- cor.mtest(numerics)
#
# title <- "Correlation Plot of Numeric Features"
# col <- colorRampPalette(c("#BB4444", "#EE9988", "#FFFFFF", "#77AADD", "#4477AA"))
# corrplot(M, method="color", col=col(200),
# diag=FALSE,
# type="lower", order="hclust",
# title = title,
# addCoef.col = "black",
# p.mat = p.mat, sig.level = 0.05, insig = "blank",
# mar=c(0,0,1,0)
# )
#
We eliminate variables with high Variannce Inflation Factor (VIF). VIF is one measure to detect multicollinearities between the independent variables of your model. The general idea behind is to create a linear model with one particular variable as your dependent variable and all the others as your independent variable. If there is a goodness of the fit, then there is likely to be a multicollinearity issue, which is represented by a high VIF value. In practice, a VIF of > 5 represents multicollinearity.
dummy_model <- glm(landslide ~., data = train, family = "binomial")
print(vif(dummy_model))
## GVIF Df GVIF^(1/(2*Df))
## aspect 12.344100 4 1.369091
## curvature 50.710307 4 1.633567
## earthquake 20.787125 2 2.135249
## elevation 2.932325 4 1.143935
## flow 10.313172 4 1.338672
## lithology 2.566810 5 1.098852
## ndvi 97.259634 4 1.772114
## ndwi 101.043609 4 1.780589
## plan 13.813425 4 1.388474
## precipitation 24.401020 4 1.490823
## profile 11.675980 4 1.359602
## slope 1.951458 4 1.087164
# train <- subset(train, select = -c(curvature, ndvi, ndwi))
# test <- subset(test, select = c(curvature, ndvi, ndwi))
check for new vifs
dummy_model2 <- glm(train$landslide ~., data = train, family = "binomial")
print(vif(dummy_model2))
## GVIF Df GVIF^(1/(2*Df))
## aspect 12.344100 4 1.369091
## curvature 50.710307 4 1.633567
## earthquake 20.787125 2 2.135249
## elevation 2.932325 4 1.143935
## flow 10.313172 4 1.338672
## lithology 2.566810 5 1.098852
## ndvi 97.259634 4 1.772114
## ndwi 101.043609 4 1.780589
## plan 13.813425 4 1.388474
## precipitation 24.401020 4 1.490823
## profile 11.675980 4 1.359602
## slope 1.951458 4 1.087164
Now proceed to feature selection
For feature selection, we’re gonna use the variable importance plot from the random forest library. I find this an efficient and reliable way of removing some of the variables to create a better model
set.seed(223)
VariableImportancePlot <- randomForest(as.factor(landslide) ~ ., data = train, importance = TRUE)
varImpPlot(VariableImportancePlot)
#Create Model
finalGlm <- glm(landslide ~., data = train, family = "binomial")
print(finalGlm)
##
## Call: glm(formula = landslide ~ ., family = "binomial", data = train)
##
## Coefficients:
## (Intercept) aspect2 aspect3 aspect4 aspect5
## 1.89536 0.50486 1.13768 0.52510 -1.03823
## curvature2 curvature3 curvature4 curvature5 earthquake2
## 0.37626 -0.34836 -0.06589 0.15204 -0.17319
## earthquake3 elevation2 elevation3 elevation4 elevation5
## -0.79742 -1.65640 -1.49520 -1.88389 -1.26077
## flow2 flow3 flow4 flow5 lithology2
## -0.28939 -0.24033 -1.32298 -1.03326 1.79986
## lithology3 lithology4 lithology5 lithology6 ndvi2
## -0.31003 0.16508 0.23126 -2.73632 -0.54939
## ndvi3 ndvi4 ndvi5 ndwi2 ndwi3
## -1.45413 -1.99636 -3.38675 -0.04693 -0.88155
## ndwi4 ndwi5 plan2 plan3 plan4
## -2.21293 -2.20137 -1.11835 -0.77786 -0.86048
## plan5 precipitation2 precipitation3 precipitation4 precipitation5
## -0.78751 0.18370 1.11133 1.74518 2.19267
## profile2 profile3 profile4 profile5 slope2
## 0.32379 -0.07603 0.31675 1.82235 0.13060
## slope3 slope4 slope5
## 0.73224 1.36879 1.88090
##
## Degrees of Freedom: 932 Total (i.e. Null); 885 Residual
## Null Deviance: 1293
## Residual Deviance: 815.3 AIC: 911.3
summary(finalGlm)
##
## Call:
## glm(formula = landslide ~ ., family = "binomial", data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.7399 -0.6769 0.0758 0.6566 3.2849
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.89536 1.30403 1.453 0.146096
## aspect2 0.50486 0.48072 1.050 0.293620
## aspect3 1.13768 0.49821 2.284 0.022398 *
## aspect4 0.52510 0.50946 1.031 0.302678
## aspect5 -1.03823 0.51517 -2.015 0.043871 *
## curvature2 0.37626 0.52196 0.721 0.470999
## curvature3 -0.34836 0.67575 -0.516 0.606194
## curvature4 -0.06589 0.81933 -0.080 0.935906
## curvature5 0.15204 1.02496 0.148 0.882075
## earthquake2 -0.17319 0.72721 -0.238 0.811756
## earthquake3 -0.79742 0.79171 -1.007 0.313830
## elevation2 -1.65640 0.25800 -6.420 1.36e-10 ***
## elevation3 -1.49520 0.28568 -5.234 1.66e-07 ***
## elevation4 -1.88389 0.32055 -5.877 4.18e-09 ***
## elevation5 -1.26077 0.53447 -2.359 0.018329 *
## flow2 -0.28939 0.31570 -0.917 0.359319
## flow3 -0.24033 0.42014 -0.572 0.567310
## flow4 -1.32298 0.57899 -2.285 0.022313 *
## flow5 -1.03326 0.54785 -1.886 0.059290 .
## lithology2 1.79986 0.52256 3.444 0.000573 ***
## lithology3 -0.31003 0.27477 -1.128 0.259176
## lithology4 0.16508 0.32487 0.508 0.611350
## lithology5 0.23126 0.53367 0.433 0.664759
## lithology6 -2.73632 0.84990 -3.220 0.001284 **
## ndvi2 -0.54939 0.56220 -0.977 0.328461
## ndvi3 -1.45413 0.72448 -2.007 0.044736 *
## ndvi4 -1.99636 0.81920 -2.437 0.014811 *
## ndvi5 -3.38675 0.90072 -3.760 0.000170 ***
## ndwi2 -0.04693 0.35486 -0.132 0.894784
## ndwi3 -0.88155 0.53150 -1.659 0.097196 .
## ndwi4 -2.21293 0.70959 -3.119 0.001817 **
## ndwi5 -2.20137 0.91994 -2.393 0.016714 *
## plan2 -1.11835 0.49140 -2.276 0.022856 *
## plan3 -0.77786 0.56427 -1.379 0.168038
## plan4 -0.86048 0.63817 -1.348 0.177546
## plan5 -0.78751 0.77873 -1.011 0.311884
## precipitation2 0.18370 0.46673 0.394 0.693886
## precipitation3 1.11133 0.82590 1.346 0.178435
## precipitation4 1.74518 0.81403 2.144 0.032044 *
## precipitation5 2.19267 0.82878 2.646 0.008153 **
## profile2 0.32379 0.54258 0.597 0.550675
## profile3 -0.07603 0.60084 -0.127 0.899302
## profile4 0.31675 0.65628 0.483 0.629352
## profile5 1.82235 0.80476 2.264 0.023544 *
## slope2 0.13060 0.29179 0.448 0.654467
## slope3 0.73224 0.30053 2.436 0.014831 *
## slope4 1.36879 0.31725 4.315 1.60e-05 ***
## slope5 1.88090 0.45492 4.135 3.56e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1293.41 on 932 degrees of freedom
## Residual deviance: 815.32 on 885 degrees of freedom
## AIC: 911.32
##
## Number of Fisher Scoring iterations: 5
#Predict on Test Set We set the threshold normally to be 0.5. But this can be adjusted.
thresh <- 0.5
predictlandslideNumLog <- predict(finalGlm, newdata = test, type='response')
predictlandslidelog <- ifelse(predictlandslideNumLog > thresh, 1, 0)
test$predicteclandslide <- predictlandslidelog
#Evaluate the model Confusion Matrix
cm <- confusionMatrix(table(test$landslide, test$predicteclandslide))
test$predicteclandslide <- as.factor(test$predicteclandslide)
table <- data.frame(confusionMatrix(test$landslide, test$predicteclandslide)$table)
print(cm)
## Confusion Matrix and Statistics
##
##
## 0 1
## 0 107 33
## 1 39 100
##
## Accuracy : 0.7419
## 95% CI : (0.6864, 0.7923)
## No Information Rate : 0.5233
## P-Value [Acc > NIR] : 5.569e-14
##
## Kappa : 0.4838
##
## Mcnemar's Test P-Value : 0.5557
##
## Sensitivity : 0.7329
## Specificity : 0.7519
## Pos Pred Value : 0.7643
## Neg Pred Value : 0.7194
## Prevalence : 0.5233
## Detection Rate : 0.3835
## Detection Prevalence : 0.5018
## Balanced Accuracy : 0.7424
##
## 'Positive' Class : 0
##
print(cm$byClass)
## Sensitivity Specificity Pos Pred Value
## 0.7328767 0.7518797 0.7642857
## Neg Pred Value Precision Recall
## 0.7194245 0.7642857 0.7328767
## F1 Prevalence Detection Rate
## 0.7482517 0.5232975 0.3835125
## Detection Prevalence Balanced Accuracy
## 0.5017921 0.7423782
plotTable <- table %>%
mutate(goodbad = ifelse(table$Prediction == table$Reference, "Good", "Bad")) %>%
group_by(Reference) %>%
mutate(prop = Freq / sum(Freq))
plotTable
## # A tibble: 4 × 5
## # Groups: Reference [2]
## Prediction Reference Freq goodbad prop
## <fct> <fct> <int> <chr> <dbl>
## 1 0 0 107 Good 0.733
## 2 1 0 39 Bad 0.267
## 3 0 1 33 Bad 0.248
## 4 1 1 100 Good 0.752
confusionMatrix <- ggplot(data = plotTable, mapping = aes(x = Reference, y = Prediction, fill = goodbad, alpha = prop)) +
geom_tile() +
geom_text(aes(label = Freq), vjust = 0.5, fontface = "bold", alpha = 25, size = 8) +
scale_fill_manual(name = " ", values = c(Good = "#2FC70A", Bad = "#F8100C")) +
scale_alpha(name = " ") +
theme_classic()+
xlim(rev(levels(table$Reference))) +
scale_y_discrete(name = "Predicted", limits = c("1","0")) +
scale_x_discrete(name = "Actual", position = "top") +
theme(text=element_text(size=25, family="sans")) +
ggtitle("Confusion Matrix") +
theme(plot.title = element_text(size = 25, family="sans", face = "bold"))
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
confusionMatrix
#ROC Curve & AUC¶ When dealing with a highly imbalanced dataset
(wherein the number of 0’s and 1’s differs by a large margin), relying
with accuracy as the sole measure for model performance is not a usually
a good idea. A better alternative is the Receiver Operating
Characteristic Curve (ROC Curve), and the area under it (AUC). ROC is a
plot of signal (True Positive Rate) against noise (False Positive Rate).
The model performance is determined by looking at the area under the ROC
curve (or AUC). In simple terms, the higher the AUC, the better your
model is at classifying 1 as 1 and 0 as 0.
ROCLog <- roc(test$landslide, predictlandslideNumLog)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
print(auc(ROCLog))
## Area under the curve: 0.8174
plot(ROCLog, col = "#02babc", family = "sans", cex = 2, main = "Logistic Regression Model = ROC CUrve")