#load libraries
library(plyr)
## Warning: package 'plyr' was built under R version 4.2.3
library(Amelia)
## Warning: package 'Amelia' was built under R version 4.2.3
## Loading required package: Rcpp
## Warning: package 'Rcpp' was built under R version 4.2.3
## ## 
## ## Amelia II: Multiple Imputation
## ## (Version 1.8.1, built: 2022-11-18)
## ## Copyright (C) 2005-2024 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.2.3
## corrplot 0.92 loaded
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.2.3
library(MASS)
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.2.3
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
library(party)
## Warning: package 'party' was built under R version 4.2.3
## Loading required package: grid
## Loading required package: mvtnorm
## Warning: package 'mvtnorm' was built under R version 4.2.3
## Loading required package: modeltools
## Loading required package: stats4
## 
## Attaching package: 'modeltools'
## The following object is masked from 'package:plyr':
## 
##     empty
## Loading required package: strucchange
## Warning: package 'strucchange' was built under R version 4.2.3
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 4.2.3
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## Loading required package: sandwich
## Warning: package 'sandwich' was built under R version 4.2.3
library(caret)
## Warning: package 'caret' was built under R version 4.2.3
## Loading required package: lattice
library(GGally)
## Warning: package 'GGally' was built under R version 4.2.3
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library(corrplot)
library(caTools)
## Warning: package 'caTools' was built under R version 4.2.3
library(car)
## Warning: package 'car' was built under R version 4.2.3
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.2.3
## 
## Attaching package: 'car'
## The following object is masked from 'package:modeltools':
## 
##     Predict
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.2.3
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
## 
##     recode
## The following object is masked from 'package:party':
## 
##     where
## The following object is masked from 'package:randomForest':
## 
##     combine
## The following object is masked from 'package:MASS':
## 
##     select
## The following objects are masked from 'package:plyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(pROC)
## Warning: package 'pROC' was built under R version 4.2.3
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.2.3
## Warning: package 'tibble' was built under R version 4.2.3
## Warning: package 'purrr' was built under R version 4.2.3
## Warning: package 'lubridate' was built under R version 4.2.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.0
## ✔ readr     2.1.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::arrange()       masks plyr::arrange()
## ✖ stringr::boundary()    masks strucchange::boundary()
## ✖ dplyr::combine()       masks randomForest::combine()
## ✖ purrr::compact()       masks plyr::compact()
## ✖ dplyr::count()         masks plyr::count()
## ✖ dplyr::desc()          masks plyr::desc()
## ✖ dplyr::failwith()      masks plyr::failwith()
## ✖ dplyr::filter()        masks stats::filter()
## ✖ dplyr::id()            masks plyr::id()
## ✖ dplyr::lag()           masks stats::lag()
## ✖ purrr::lift()          masks caret::lift()
## ✖ randomForest::margin() masks ggplot2::margin()
## ✖ dplyr::mutate()        masks plyr::mutate()
## ✖ dplyr::recode()        masks car::recode()
## ✖ dplyr::rename()        masks plyr::rename()
## ✖ dplyr::select()        masks MASS::select()
## ✖ purrr::some()          masks car::some()
## ✖ dplyr::summarise()     masks plyr::summarise()
## ✖ dplyr::summarize()     masks plyr::summarize()
## ✖ dplyr::where()         masks party::where()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(mboost)
## Warning: package 'mboost' was built under R version 4.2.3
## Loading required package: parallel
## Loading required package: stabs
## Warning: package 'stabs' was built under R version 4.2.3
## 
## Attaching package: 'stabs'
## 
## The following object is masked from 'package:modeltools':
## 
##     parameters
## 
## 
## Attaching package: 'mboost'
## 
## The following object is masked from 'package:tidyr':
## 
##     extract
## 
## The following object is masked from 'package:party':
## 
##     varimp
## 
## The following object is masked from 'package:ggplot2':
## 
##     %+%
library(ggalluvial)
## Warning: package 'ggalluvial' was built under R version 4.2.3
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 4.2.3
## 
## Attaching package: 'gridExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     combine
## 
## The following object is masked from 'package:randomForest':
## 
##     combine
library(janitor)
## 
## Attaching package: 'janitor'
## 
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
land <- read.csv("landslide-prediction.csv")
land$X_id <- NULL
land <- clean_names(land)

dim(land)
## [1] 1212   13
glimpse(land)
## Rows: 1,212
## Columns: 13
## $ landslide     <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ aspect        <int> 3, 1, 3, 1, 5, 5, 1, 3, 3, 1, 3, 5, 3, 4, 5, 4, 5, 5, 5,…
## $ curvature     <int> 3, 5, 4, 3, 4, 5, 3, 5, 2, 4, 3, 5, 2, 3, 2, 3, 4, 3, 1,…
## $ earthquake    <int> 2, 2, 3, 3, 2, 2, 2, 3, 3, 3, 3, 1, 3, 2, 1, 3, 2, 2, 2,…
## $ elevation     <int> 2, 3, 2, 3, 1, 2, 2, 4, 3, 2, 2, 2, 3, 1, 1, 3, 2, 1, 3,…
## $ flow          <int> 2, 1, 2, 5, 4, 3, 4, 2, 2, 4, 2, 3, 2, 2, 3, 3, 3, 3, 3,…
## $ lithology     <int> 1, 1, 4, 1, 1, 1, 1, 2, 6, 1, 1, 1, 3, 1, 3, 1, 3, 1, 1,…
## $ ndvi          <int> 4, 4, 3, 2, 2, 2, 3, 3, 4, 2, 2, 2, 4, 4, 1, 2, 3, 2, 1,…
## $ ndwi          <int> 2, 2, 2, 4, 4, 4, 4, 3, 2, 4, 3, 4, 2, 2, 5, 4, 3, 4, 5,…
## $ plan          <int> 2, 5, 4, 3, 3, 5, 3, 5, 2, 4, 3, 5, 3, 4, 3, 3, 4, 3, 1,…
## $ precipitation <int> 3, 5, 5, 5, 3, 3, 5, 5, 5, 5, 5, 2, 4, 4, 2, 5, 2, 4, 3,…
## $ profile       <int> 3, 2, 2, 3, 1, 2, 3, 2, 4, 2, 3, 2, 4, 4, 4, 3, 2, 3, 5,…
## $ slope         <int> 2, 2, 2, 3, 4, 2, 2, 4, 4, 4, 1, 4, 1, 2, 1, 2, 1, 3, 4,…
library(mice)
## Warning: package 'mice' was built under R version 4.2.3
## 
## Attaching package: 'mice'
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following objects are masked from 'package:base':
## 
##     cbind, rbind
md.pattern(land, rotate.names = T)
##  /\     /\
## {  `---'  }
## {  O   O  }
## ==>  V <==  No need for mice. This data set is completely observed.
##  \  \|/  /
##   `-----'

##      landslide aspect curvature earthquake elevation flow lithology ndvi ndwi
## 1212         1      1         1          1         1    1         1    1    1
##              0      0         0          0         0    0         0    0    0
##      plan precipitation profile slope  
## 1212    1             1       1     1 0
##         0             0       0     0 0
missmap(land)

#check for duplicate
sum(duplicated(land))
## [1] 14
str(land)
## 'data.frame':    1212 obs. of  13 variables:
##  $ landslide    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ aspect       : int  3 1 3 1 5 5 1 3 3 1 ...
##  $ curvature    : int  3 5 4 3 4 5 3 5 2 4 ...
##  $ earthquake   : int  2 2 3 3 2 2 2 3 3 3 ...
##  $ elevation    : int  2 3 2 3 1 2 2 4 3 2 ...
##  $ flow         : int  2 1 2 5 4 3 4 2 2 4 ...
##  $ lithology    : int  1 1 4 1 1 1 1 2 6 1 ...
##  $ ndvi         : int  4 4 3 2 2 2 3 3 4 2 ...
##  $ ndwi         : int  2 2 2 4 4 4 4 3 2 4 ...
##  $ plan         : int  2 5 4 3 3 5 3 5 2 4 ...
##  $ precipitation: int  3 5 5 5 3 3 5 5 5 5 ...
##  $ profile      : int  3 2 2 3 1 2 3 2 4 2 ...
##  $ slope        : int  2 2 2 3 4 2 2 4 4 4 ...
land$landslide <-   as.factor(land$landslide)
land$aspect <-      as.factor(land$aspect)
land$curvature <-   as.factor(land$curvature)
land$earthquake <-  as.factor(land$earthquake)
land$elevation <-   as.factor(land$elevation)
land$flow <-        as.factor(land$flow)
land$lithology <-   as.factor(land$lithology)
land$ndvi <-        as.factor(land$ndvi)
land$ndwi <-        as.factor(land$ndwi)
land$plan <-        as.factor(land$plan)
land$precipitation <-   as.factor(land$precipitation)
land$profile <-         as.factor(land$profile)
land$slope <-   as.factor(land$slope)

#Exploratory data analysis

landslide_count <- ggplot(land, aes(x = landslide)) +
  geom_bar(color = "red", width = 0.9, fill = "blue") +
  coord_flip() + theme_classic() +
  theme(text = element_text(size = 20, family = "sans")) +
  theme(legend.position = "none") +
  scale_y_continuous(name = " ", limits = c(0,700)) +
  scale_x_discrete(limits = c("1", "0")) +
  ggtitle("Landslide count") +
  theme(text = element_text(size = 20, face = "bold")) 

landslide_count

land_aspect <- ggplot(land, aes(x = aspect, fill = landslide)) + 
  scale_fill_manual(values = c("1" = "#BB2936","0"="#2A7BBA")) +
  geom_bar(color = "black", width = 0.9) +
  theme_classic() +
  theme(text = element_text(size = 20, family =  "sans")) +
  #theme(legend.position = "none") +
  scale_y_continuous(name = " ", limits = c(0,700)) +
  scale_x_discrete(name = " ") +
  ggtitle("Landslide and Aspect") +
  theme(text = element_text(size = 20, face = "bold"))

land_aspect

land_curvature <- ggplot(land, aes(x = curvature, fill = landslide)) + 
  scale_fill_manual(values = c("1" = "#BB2936","0"="#2A7BBA")) +
  geom_bar(values = c("1" = "#BB2936","0"="#2A7BBA")) +
  theme_classic() +
  theme(text = element_text(size = 20, family =  "sans")) +
  #theme(legend.position = "none") +
  scale_y_continuous(name = " ", limits = c(0,700)) +
  scale_x_discrete(name = " ") +
  ggtitle("Landslide and Curvature") +
  theme(text = element_text(size = 20, face = "bold"))
## Warning in geom_bar(values = c(`1` = "#BB2936", `0` = "#2A7BBA")): Ignoring
## unknown parameters: `values`
land_curvature

land_earthquake <- ggplot(land, aes(x = earthquake, fill = landslide)) + 
  scale_fill_manual(values = c("1" = "#BB2936","0"="#2A7BBA")) +
  geom_bar(values = c("1" = "#BB2936","0"="#2A7BBA")) +
  theme_classic() +
  theme(text = element_text(size = 20, family =  "sans")) +
  #theme(legend.position = "none") +
  scale_y_continuous(name = " ", limits = c(0,700)) +
  scale_x_discrete(name = " ") +
  ggtitle("Landslide and  Earthquake") +
  theme(text = element_text(size = 20, face = "bold"))
## Warning in geom_bar(values = c(`1` = "#BB2936", `0` = "#2A7BBA")): Ignoring
## unknown parameters: `values`
land_earthquake

land_elevation <- ggplot(land, aes(x = elevation, fill = landslide)) + 
  scale_fill_manual(values = c("1" = "#BB2936","0"="#2A7BBA")) +
  geom_bar(values = c("1" = "#BB2936","0"="#2A7BBA")) +
  theme_classic() +
  theme(text = element_text(size = 20, family =  "sans")) +
  #theme(legend.position = "none") +
  scale_y_continuous(name = " ", limits = c(0,700)) +
  scale_x_discrete(name = " ") +
  ggtitle("Landslide and  Elevation") +
  theme(text = element_text(size = 20, face = "bold"))
## Warning in geom_bar(values = c(`1` = "#BB2936", `0` = "#2A7BBA")): Ignoring
## unknown parameters: `values`
land_elevation

land_flow<- ggplot(land, aes(x = flow, fill = landslide)) + 
  scale_fill_manual(values = c("1" = "#BB2936","0"="#2A7BBA")) +
  geom_bar(values = c("1" = "#BB2936","0"="#2A7BBA")) +
  theme_classic() +
  theme(text = element_text(size = 20, family =  "sans")) +
  #theme(legend.position = "none") +
  scale_y_continuous(name = " ", limits = c(0,700)) +
  scale_x_discrete(name = " ") +
  ggtitle("Landslide and Flow") +
  theme(text = element_text(size = 20, face = "bold"))
## Warning in geom_bar(values = c(`1` = "#BB2936", `0` = "#2A7BBA")): Ignoring
## unknown parameters: `values`
land_flow

land_plan <- ggplot(land, aes(x = plan, fill = landslide)) + 
  scale_fill_manual(values = c("1" = "#BB2936","0"="#2A7BBA")) +
  geom_bar(values = c("1" = "#BB2936","0"="#2A7BBA")) +
  theme_classic() +
  theme(text = element_text(size = 20, family =  "sans")) +
  #theme(legend.position = "none") +
  scale_y_continuous(name = " ", limits = c(0,700)) +
  scale_x_discrete(name = " ") +
  ggtitle("Landslide and  Plan") +
  theme(text = element_text(size = 20, face = "bold"))
## Warning in geom_bar(values = c(`1` = "#BB2936", `0` = "#2A7BBA")): Ignoring
## unknown parameters: `values`
land_plan

land_precipitation <- ggplot(land, aes(x = precipitation, fill = landslide)) + 
  scale_fill_manual(values = c("1" = "#BB2936","0"="#2A7BBA")) +
  geom_bar(values = c("1" = "#BB2936","0"="#2A7BBA")) +
  theme_classic() +
  theme(text = element_text(size = 20, family =  "sans")) +
  #theme(legend.position = "none") +
  scale_y_continuous(name = " ", limits = c(0,700)) +
  scale_x_discrete(name = " ") +
  ggtitle("Landslide and  Precipitation") +
  theme(text = element_text(size = 20, face = "bold"))
## Warning in geom_bar(values = c(`1` = "#BB2936", `0` = "#2A7BBA")): Ignoring
## unknown parameters: `values`
land_precipitation

land_profile <- ggplot(land, aes(x = profile, fill = landslide)) + 
  scale_fill_manual(values = c("1" = "#BB2936","0"="#2A7BBA")) +
  geom_bar(values = c("1" = "#BB2936","0"="#2A7BBA")) +
  theme_classic() +
  theme(text = element_text(size = 20, family =  "sans")) +
  #theme(legend.position = "none") +
  scale_y_continuous(name = " ", limits = c(0,700)) +
  scale_x_discrete(name = " ") +
  ggtitle("Landslide and Profile") +
  theme(text = element_text(size = 20, face = "bold"))
## Warning in geom_bar(values = c(`1` = "#BB2936", `0` = "#2A7BBA")): Ignoring
## unknown parameters: `values`
land_profile

land_slope <- ggplot(land, aes(x = slope, fill = landslide)) + 
  scale_fill_manual(values = c("1" = "#BB2936","0"="#2A7BBA")) +
  geom_bar(values = c("1" = "#BB2936","0"="#2A7BBA")) +
  theme_classic() +
  theme(text = element_text(size = 20, family =  "sans")) +
  #theme(legend.position = "none") +
  scale_y_continuous(name = " ", limits = c(0,700)) +
  scale_x_discrete(name = " ") +
  ggtitle("Landslide and  Elevation") +
  theme(text = element_text(size = 20, face = "bold"))
## Warning in geom_bar(values = c(`1` = "#BB2936", `0` = "#2A7BBA")): Ignoring
## unknown parameters: `values`
grid.arrange(
landslide_count, land_aspect,land_curvature,land_earthquake,land_elevation, land_flow, land_plan, land_precipitation, land_profile, land_slope, ncol = 5, nrow = 2)

#Logistic Regression split to train and test

set.seed(123)

split <- sample.split(land, SplitRatio = 0.8)
train <- subset(land, split == "TRUE")
test <- subset(land, split == "FALSE")

str(train)
## 'data.frame':    933 obs. of  13 variables:
##  $ landslide    : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ aspect       : Factor w/ 5 levels "1","2","3","4",..: 3 1 3 1 5 1 3 1 5 3 ...
##  $ curvature    : Factor w/ 5 levels "1","2","3","4",..: 3 5 4 3 5 3 2 4 5 2 ...
##  $ earthquake   : Factor w/ 3 levels "1","2","3": 2 2 3 3 2 2 3 3 1 3 ...
##  $ elevation    : Factor w/ 5 levels "1","2","3","4",..: 2 3 2 3 2 2 3 2 2 3 ...
##  $ flow         : Factor w/ 5 levels "1","2","3","4",..: 2 1 2 5 3 4 2 4 3 2 ...
##  $ lithology    : Factor w/ 6 levels "1","2","3","4",..: 1 1 4 1 1 1 6 1 1 3 ...
##  $ ndvi         : Factor w/ 5 levels "1","2","3","4",..: 4 4 3 2 2 3 4 2 2 4 ...
##  $ ndwi         : Factor w/ 5 levels "1","2","3","4",..: 2 2 2 4 4 4 2 4 4 2 ...
##  $ plan         : Factor w/ 5 levels "1","2","3","4",..: 2 5 4 3 5 3 2 4 5 3 ...
##  $ precipitation: Factor w/ 5 levels "1","2","3","4",..: 3 5 5 5 3 5 5 5 2 4 ...
##  $ profile      : Factor w/ 5 levels "1","2","3","4",..: 3 2 2 3 2 3 4 2 2 4 ...
##  $ slope        : Factor w/ 5 levels "1","2","3","4",..: 2 2 2 3 2 2 4 4 4 1 ...
str(test)
## 'data.frame':    279 obs. of  13 variables:
##  $ landslide    : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ aspect       : Factor w/ 5 levels "1","2","3","4",..: 5 3 3 5 1 1 1 4 1 1 ...
##  $ curvature    : Factor w/ 5 levels "1","2","3","4",..: 4 5 3 3 2 4 4 2 2 5 ...
##  $ earthquake   : Factor w/ 3 levels "1","2","3": 2 3 3 2 1 1 3 2 3 3 ...
##  $ elevation    : Factor w/ 5 levels "1","2","3","4",..: 1 4 2 1 5 1 4 3 1 4 ...
##  $ flow         : Factor w/ 5 levels "1","2","3","4",..: 4 2 2 3 5 5 4 3 4 5 ...
##  $ lithology    : Factor w/ 6 levels "1","2","3","4",..: 1 2 1 1 3 1 3 4 1 6 ...
##  $ ndvi         : Factor w/ 5 levels "1","2","3","4",..: 2 3 2 2 1 4 1 4 1 2 ...
##  $ ndwi         : Factor w/ 5 levels "1","2","3","4",..: 4 3 3 4 4 2 5 2 5 4 ...
##  $ plan         : Factor w/ 5 levels "1","2","3","4",..: 3 5 3 3 2 4 3 2 3 5 ...
##  $ precipitation: Factor w/ 5 levels "1","2","3","4",..: 3 5 5 4 1 1 5 3 5 5 ...
##  $ profile      : Factor w/ 5 levels "1","2","3","4",..: 1 2 3 3 4 3 2 4 5 2 ...
##  $ slope        : Factor w/ 5 levels "1","2","3","4",..: 4 4 1 3 4 2 5 4 3 3 ...

#Check for Multicolinearity on Numeric Variables Visualizing using correlation Plot

# land2 <- read.csv("landslide-prediction.csv")
# 
# numeric_var <- which(sapply(land2, is.numeric))
# numeric_var_names <- names(numeric_var)
# cat("There are", length(numeric_var), "numeric variables")
# 
# land_numvar <- land2[, numeric_var]
# cor_numvar <- cor(land_numvar, use = "pairwise.complete.obs")
# library(dplyr)
# 
# cor_sorted <- as.matrix(sort(cor_numvar[, 'earthquake'], decreasing = TRUE))
# 
# #select only high correlation
# corHigh <- names(which(apply(cor_sorted, 1, function(x) abs(x) > 0.5)))
# cor_numVar <- cor_numvar[corHigh, corHigh]
# 
# corrplot.mixed(cor_numVar, tl.col = "black", tl.pos= "lt")

# numerics <- unlist(lapply(train, is.numeric))
# numerics <- train[,numerics]
# 
# cor.mtest <- function(mat, ...) {
#   mat <- as.matrix(mat)
#   n <- ncol(mat)
#   p.mat<- matrix(NA, n, n)
#   diag(p.mat) <- 0
#   for (i in 1:(n - 1)) {
#     for (j in (i + 1):n) {
#       tmp <- cor.test(mat[, i], mat[, j], ...)
#       p.mat[i, j] <- p.mat[j, i] <- tmp$p.value
#     }
#   }
#   colnames(p.mat) <- rownames(p.mat) <- colnames(mat)
#   p.mat
# }
#  M <- cor(numerics)
#  p.mat <- cor.mtest(numerics)
# 
#  title <- "Correlation Plot of Numeric Features"
#  col <- colorRampPalette(c("#BB4444", "#EE9988", "#FFFFFF", "#77AADD", "#4477AA"))
#  corrplot(M, method="color", col=col(200),
#           diag=FALSE,
#           type="lower", order="hclust",
#           title = title,
#          addCoef.col = "black",
#          p.mat = p.mat, sig.level = 0.05, insig = "blank",
#            mar=c(0,0,1,0) 
#   )
# 

We eliminate variables with high Variannce Inflation Factor (VIF). VIF is one measure to detect multicollinearities between the independent variables of your model. The general idea behind is to create a linear model with one particular variable as your dependent variable and all the others as your independent variable. If there is a goodness of the fit, then there is likely to be a multicollinearity issue, which is represented by a high VIF value. In practice, a VIF of > 5 represents multicollinearity.

 dummy_model <- glm(landslide ~., data = train, family = "binomial")
print(vif(dummy_model))
##                     GVIF Df GVIF^(1/(2*Df))
## aspect         12.344100  4        1.369091
## curvature      50.710307  4        1.633567
## earthquake     20.787125  2        2.135249
## elevation       2.932325  4        1.143935
## flow           10.313172  4        1.338672
## lithology       2.566810  5        1.098852
## ndvi           97.259634  4        1.772114
## ndwi          101.043609  4        1.780589
## plan           13.813425  4        1.388474
## precipitation  24.401020  4        1.490823
## profile        11.675980  4        1.359602
## slope           1.951458  4        1.087164
# train <- subset(train, select = -c(curvature, ndvi,  ndwi))
# test <- subset(test, select = c(curvature, ndvi,  ndwi))

check for new vifs

dummy_model2 <- glm(train$landslide ~., data = train, family = "binomial")
print(vif(dummy_model2))
##                     GVIF Df GVIF^(1/(2*Df))
## aspect         12.344100  4        1.369091
## curvature      50.710307  4        1.633567
## earthquake     20.787125  2        2.135249
## elevation       2.932325  4        1.143935
## flow           10.313172  4        1.338672
## lithology       2.566810  5        1.098852
## ndvi           97.259634  4        1.772114
## ndwi          101.043609  4        1.780589
## plan           13.813425  4        1.388474
## precipitation  24.401020  4        1.490823
## profile        11.675980  4        1.359602
## slope           1.951458  4        1.087164

Now proceed to feature selection

For feature selection, we’re gonna use the variable importance plot from the random forest library. I find this an efficient and reliable way of removing some of the variables to create a better model

set.seed(223)
VariableImportancePlot <- randomForest(as.factor(landslide) ~ ., data = train, importance = TRUE)

varImpPlot(VariableImportancePlot)

#Create Model

finalGlm <- glm(landslide ~., data = train, family = "binomial")
print(finalGlm)
## 
## Call:  glm(formula = landslide ~ ., family = "binomial", data = train)
## 
## Coefficients:
##    (Intercept)         aspect2         aspect3         aspect4         aspect5  
##        1.89536         0.50486         1.13768         0.52510        -1.03823  
##     curvature2      curvature3      curvature4      curvature5     earthquake2  
##        0.37626        -0.34836        -0.06589         0.15204        -0.17319  
##    earthquake3      elevation2      elevation3      elevation4      elevation5  
##       -0.79742        -1.65640        -1.49520        -1.88389        -1.26077  
##          flow2           flow3           flow4           flow5      lithology2  
##       -0.28939        -0.24033        -1.32298        -1.03326         1.79986  
##     lithology3      lithology4      lithology5      lithology6           ndvi2  
##       -0.31003         0.16508         0.23126        -2.73632        -0.54939  
##          ndvi3           ndvi4           ndvi5           ndwi2           ndwi3  
##       -1.45413        -1.99636        -3.38675        -0.04693        -0.88155  
##          ndwi4           ndwi5           plan2           plan3           plan4  
##       -2.21293        -2.20137        -1.11835        -0.77786        -0.86048  
##          plan5  precipitation2  precipitation3  precipitation4  precipitation5  
##       -0.78751         0.18370         1.11133         1.74518         2.19267  
##       profile2        profile3        profile4        profile5          slope2  
##        0.32379        -0.07603         0.31675         1.82235         0.13060  
##         slope3          slope4          slope5  
##        0.73224         1.36879         1.88090  
## 
## Degrees of Freedom: 932 Total (i.e. Null);  885 Residual
## Null Deviance:       1293 
## Residual Deviance: 815.3     AIC: 911.3
summary(finalGlm)
## 
## Call:
## glm(formula = landslide ~ ., family = "binomial", data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.7399  -0.6769   0.0758   0.6566   3.2849  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     1.89536    1.30403   1.453 0.146096    
## aspect2         0.50486    0.48072   1.050 0.293620    
## aspect3         1.13768    0.49821   2.284 0.022398 *  
## aspect4         0.52510    0.50946   1.031 0.302678    
## aspect5        -1.03823    0.51517  -2.015 0.043871 *  
## curvature2      0.37626    0.52196   0.721 0.470999    
## curvature3     -0.34836    0.67575  -0.516 0.606194    
## curvature4     -0.06589    0.81933  -0.080 0.935906    
## curvature5      0.15204    1.02496   0.148 0.882075    
## earthquake2    -0.17319    0.72721  -0.238 0.811756    
## earthquake3    -0.79742    0.79171  -1.007 0.313830    
## elevation2     -1.65640    0.25800  -6.420 1.36e-10 ***
## elevation3     -1.49520    0.28568  -5.234 1.66e-07 ***
## elevation4     -1.88389    0.32055  -5.877 4.18e-09 ***
## elevation5     -1.26077    0.53447  -2.359 0.018329 *  
## flow2          -0.28939    0.31570  -0.917 0.359319    
## flow3          -0.24033    0.42014  -0.572 0.567310    
## flow4          -1.32298    0.57899  -2.285 0.022313 *  
## flow5          -1.03326    0.54785  -1.886 0.059290 .  
## lithology2      1.79986    0.52256   3.444 0.000573 ***
## lithology3     -0.31003    0.27477  -1.128 0.259176    
## lithology4      0.16508    0.32487   0.508 0.611350    
## lithology5      0.23126    0.53367   0.433 0.664759    
## lithology6     -2.73632    0.84990  -3.220 0.001284 ** 
## ndvi2          -0.54939    0.56220  -0.977 0.328461    
## ndvi3          -1.45413    0.72448  -2.007 0.044736 *  
## ndvi4          -1.99636    0.81920  -2.437 0.014811 *  
## ndvi5          -3.38675    0.90072  -3.760 0.000170 ***
## ndwi2          -0.04693    0.35486  -0.132 0.894784    
## ndwi3          -0.88155    0.53150  -1.659 0.097196 .  
## ndwi4          -2.21293    0.70959  -3.119 0.001817 ** 
## ndwi5          -2.20137    0.91994  -2.393 0.016714 *  
## plan2          -1.11835    0.49140  -2.276 0.022856 *  
## plan3          -0.77786    0.56427  -1.379 0.168038    
## plan4          -0.86048    0.63817  -1.348 0.177546    
## plan5          -0.78751    0.77873  -1.011 0.311884    
## precipitation2  0.18370    0.46673   0.394 0.693886    
## precipitation3  1.11133    0.82590   1.346 0.178435    
## precipitation4  1.74518    0.81403   2.144 0.032044 *  
## precipitation5  2.19267    0.82878   2.646 0.008153 ** 
## profile2        0.32379    0.54258   0.597 0.550675    
## profile3       -0.07603    0.60084  -0.127 0.899302    
## profile4        0.31675    0.65628   0.483 0.629352    
## profile5        1.82235    0.80476   2.264 0.023544 *  
## slope2          0.13060    0.29179   0.448 0.654467    
## slope3          0.73224    0.30053   2.436 0.014831 *  
## slope4          1.36879    0.31725   4.315 1.60e-05 ***
## slope5          1.88090    0.45492   4.135 3.56e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1293.41  on 932  degrees of freedom
## Residual deviance:  815.32  on 885  degrees of freedom
## AIC: 911.32
## 
## Number of Fisher Scoring iterations: 5

#Predict on Test Set We set the threshold normally to be 0.5. But this can be adjusted.

thresh <- 0.5
predictlandslideNumLog <- predict(finalGlm, newdata = test, type='response')
predictlandslidelog <- ifelse(predictlandslideNumLog > thresh, 1, 0)

test$predicteclandslide <- predictlandslidelog

#Evaluate the model Confusion Matrix

cm <- confusionMatrix(table(test$landslide, test$predicteclandslide))
test$predicteclandslide <- as.factor(test$predicteclandslide)

table <- data.frame(confusionMatrix(test$landslide, test$predicteclandslide)$table)
print(cm)
## Confusion Matrix and Statistics
## 
##    
##       0   1
##   0 107  33
##   1  39 100
##                                           
##                Accuracy : 0.7419          
##                  95% CI : (0.6864, 0.7923)
##     No Information Rate : 0.5233          
##     P-Value [Acc > NIR] : 5.569e-14       
##                                           
##                   Kappa : 0.4838          
##                                           
##  Mcnemar's Test P-Value : 0.5557          
##                                           
##             Sensitivity : 0.7329          
##             Specificity : 0.7519          
##          Pos Pred Value : 0.7643          
##          Neg Pred Value : 0.7194          
##              Prevalence : 0.5233          
##          Detection Rate : 0.3835          
##    Detection Prevalence : 0.5018          
##       Balanced Accuracy : 0.7424          
##                                           
##        'Positive' Class : 0               
## 
print(cm$byClass)
##          Sensitivity          Specificity       Pos Pred Value 
##            0.7328767            0.7518797            0.7642857 
##       Neg Pred Value            Precision               Recall 
##            0.7194245            0.7642857            0.7328767 
##                   F1           Prevalence       Detection Rate 
##            0.7482517            0.5232975            0.3835125 
## Detection Prevalence    Balanced Accuracy 
##            0.5017921            0.7423782
plotTable <- table %>% 
  mutate(goodbad = ifelse(table$Prediction == table$Reference, "Good", "Bad")) %>% 
  group_by(Reference) %>% 
  mutate(prop = Freq / sum(Freq))
plotTable
## # A tibble: 4 × 5
## # Groups:   Reference [2]
##   Prediction Reference  Freq goodbad  prop
##   <fct>      <fct>     <int> <chr>   <dbl>
## 1 0          0           107 Good    0.733
## 2 1          0            39 Bad     0.267
## 3 0          1            33 Bad     0.248
## 4 1          1           100 Good    0.752
confusionMatrix <- ggplot(data = plotTable, mapping = aes(x = Reference,  y = Prediction, fill = goodbad, alpha = prop)) +
  geom_tile() +
  geom_text(aes(label = Freq), vjust = 0.5, fontface = "bold", alpha = 25, size = 8) +
  scale_fill_manual(name = " ", values = c(Good = "#2FC70A", Bad = "#F8100C")) +
  scale_alpha(name = " ") + 
  theme_classic()+
   xlim(rev(levels(table$Reference))) +
  scale_y_discrete(name = "Predicted", limits = c("1","0")) + 
  scale_x_discrete(name = "Actual", position = "top") +
  theme(text=element_text(size=25,  family="sans")) + 
  ggtitle("Confusion Matrix") +
  theme(plot.title = element_text(size = 25, family="sans", face = "bold"))
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
confusionMatrix

#ROC Curve & AUC¶ When dealing with a highly imbalanced dataset (wherein the number of 0’s and 1’s differs by a large margin), relying with accuracy as the sole measure for model performance is not a usually a good idea. A better alternative is the Receiver Operating Characteristic Curve (ROC Curve), and the area under it (AUC). ROC is a plot of signal (True Positive Rate) against noise (False Positive Rate). The model performance is determined by looking at the area under the ROC curve (or AUC). In simple terms, the higher the AUC, the better your model is at classifying 1 as 1 and 0 as 0.

ROCLog <- roc(test$landslide, predictlandslideNumLog)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
print(auc(ROCLog))
## Area under the curve: 0.8174
plot(ROCLog, col = "#02babc", family = "sans", cex = 2, main = "Logistic Regression Model = ROC CUrve")