#RANDOM FORESTS
#Decision trees try to find the optimal rule to forecast an outcome based on a sequence of simple decision steps.
# Clear the global environment, switch to E:/R (the CSV files further down are
# read relative to the working directory), and print numbers with 3 significant
# digits while strongly discouraging scientific notation.
rm(list = ls())
setwd("E:/R")
options(scipen = 9999, digits = 3)
# Ver. "2021-01-29"
# JWatt Notes:
#Random forests average the output of many decision trees. Each decision tree is then fit on a
#small subset of training examples or is constrained to use only a small subset of input features. Averaging the output of
#these trees reduces variance of the overall estimator.
#To fit a decision tree, the algorithm usually looks for the best variable and the best splitting value among all possibilities, so
#that a particular loss function is minimized. The loss function can be defined as the impurities in the child nodes, which are
#measured by a Gini index or entropy. Criteria can be used to ensure the tree is interpretable and prevent overfitting, e.g.
# - Max depth: deciding a maximum depth of the tree
# - Node size: at least N observations in each node
#One can also build a large tree with many branches, and then prune the tree by combining subtrees with the lowest trade-off
#in the goodness of fit. Apart from classifications, decision trees can also be used for regressions that predict a continuous
#outcome. In that case, the model is simply a piecewise constant "surface" depending on the thresholds of the explanatory
#variables.
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0 ✔ purrr 0.3.5
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.1 ✔ stringr 1.5.0
## ✔ readr 2.1.3 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
#special libraries
library(mdsr)
library(rpart)
library(partykit)
## Loading required package: grid
## Loading required package: libcoin
## Loading required package: mvtnorm
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
##
## The following object is masked from 'package:dplyr':
##
## combine
##
## The following object is masked from 'package:ggplot2':
##
## margin
library(DescTools)
##
## Attaching package: 'DescTools'
##
## The following objects are masked from 'package:caret':
##
## MAE, RMSE
###########################################################################################
# Read the cancer dataset. Character columns (here the B/M diagnosis label) are
# converted to factors on import. TRUE is spelled out because T is an ordinary
# variable that can be reassigned.
cancer = read.csv("wisc_bc_data.csv", header = TRUE,
                  stringsAsFactors = TRUE)
# Preview the first six rows
head(cancer)
## id diagnosis radius_mean texture_mean perimeter_mean area_mean
## 1 842302 M 18.0 10.4 122.8 1001
## 2 842517 M 20.6 17.8 132.9 1326
## 3 84300903 M 19.7 21.2 130.0 1203
## 4 84348301 M 11.4 20.4 77.6 386
## 5 84358402 M 20.3 14.3 135.1 1297
## 6 843786 M 12.4 15.7 82.6 477
## smoothness_mean compactness_mean concavity_mean concave.points_mean
## 1 0.1184 0.2776 0.3001 0.1471
## 2 0.0847 0.0786 0.0869 0.0702
## 3 0.1096 0.1599 0.1974 0.1279
## 4 0.1425 0.2839 0.2414 0.1052
## 5 0.1003 0.1328 0.1980 0.1043
## 6 0.1278 0.1700 0.1578 0.0809
## symmetry_mean fractal_dimension_mean radius_se texture_se perimeter_se
## 1 0.242 0.0787 1.095 0.905 8.59
## 2 0.181 0.0567 0.543 0.734 3.40
## 3 0.207 0.0600 0.746 0.787 4.58
## 4 0.260 0.0974 0.496 1.156 3.44
## 5 0.181 0.0588 0.757 0.781 5.44
## 6 0.209 0.0761 0.335 0.890 2.22
## area_se smoothness_se compactness_se concavity_se concave.points_se
## 1 153.4 0.00640 0.0490 0.0537 0.0159
## 2 74.1 0.00522 0.0131 0.0186 0.0134
## 3 94.0 0.00615 0.0401 0.0383 0.0206
## 4 27.2 0.00911 0.0746 0.0566 0.0187
## 5 94.4 0.01149 0.0246 0.0569 0.0188
## 6 27.2 0.00751 0.0335 0.0367 0.0114
## symmetry_se fractal_dimension_se radius_worst texture_worst perimeter_worst
## 1 0.0300 0.00619 25.4 17.3 184.6
## 2 0.0139 0.00353 25.0 23.4 158.8
## 3 0.0225 0.00457 23.6 25.5 152.5
## 4 0.0596 0.00921 14.9 26.5 98.9
## 5 0.0176 0.00511 22.5 16.7 152.2
## 6 0.0216 0.00508 15.5 23.8 103.4
## area_worst smoothness_worst compactness_worst concavity_worst
## 1 2019 0.162 0.666 0.712
## 2 1956 0.124 0.187 0.242
## 3 1709 0.144 0.424 0.450
## 4 568 0.210 0.866 0.687
## 5 1575 0.137 0.205 0.400
## 6 742 0.179 0.525 0.535
## concave.points_worst symmetry_worst fractal_dimension_worst
## 1 0.265 0.460 0.1189
## 2 0.186 0.275 0.0890
## 3 0.243 0.361 0.0876
## 4 0.258 0.664 0.1730
## 5 0.163 0.236 0.0768
## 6 0.174 0.399 0.1244
# Fix the RNG seed so the random 80/20 split is reproducible
set.seed(12345)
# Tag every row with its position; the tag lets the hold-out set be recovered
# as "all rows that were not sampled into train"
cancer_df = dplyr::mutate(cancer, ID = row_number())
train = sample_frac(cancer_df, 0.8)
test = anti_join(cancer_df, train, by = "ID")
# Drop the helper tag (ID) and the file's own identifier (id) from both subsets
train = select(train, -ID, -id)
test = select(test, -ID, -id)
#Establish Null Model
# List the columns that remain in the training set (response + predictors)
names(train)
## [1] "diagnosis" "radius_mean"
## [3] "texture_mean" "perimeter_mean"
## [5] "area_mean" "smoothness_mean"
## [7] "compactness_mean" "concavity_mean"
## [9] "concave.points_mean" "symmetry_mean"
## [11] "fractal_dimension_mean" "radius_se"
## [13] "texture_se" "perimeter_se"
## [15] "area_se" "smoothness_se"
## [17] "compactness_se" "concavity_se"
## [19] "concave.points_se" "symmetry_se"
## [21] "fractal_dimension_se" "radius_worst"
## [23] "texture_worst" "perimeter_worst"
## [25] "area_worst" "smoothness_worst"
## [27] "compactness_worst" "concavity_worst"
## [29] "concave.points_worst" "symmetry_worst"
## [31] "fractal_dimension_worst"
# Null-model baseline: share of each diagnosis class in the training set.
# Always predicting the larger class would be right at that class's share.
prop.table(table(train$diagnosis))
##
## B M
## 0.637 0.363
#Model
# Alternative specification using only the ten "_mean" features (not used; the
# forest below is fit on all predictors). Kept as a commented-out option instead
# of an assignment that is immediately overwritten:
# form = as.formula("diagnosis ~ radius_mean + texture_mean + perimeter_mean + area_mean + smoothness_mean +
# compactness_mean +concavity_mean+concave.points_mean + symmetry_mean+ fractal_dimension_mean")
# Active specification: diagnosis against every other column of `train`
form = as.formula("diagnosis ~ .")
# Make sure the response is a factor so that a classification forest is grown
train$diagnosis = as.factor(train$diagnosis)
# 200 trees, 3 candidate variables tried at each split
mod_forest = randomForest(form,data = train, ntree = 200, mtry = 3)
#mod_forest = randomForest(diagnosis~ radius_mean,data = train, ntree = 200, mtry = 3)
# Print the fit summary (OOB error estimate and OOB confusion matrix)
mod_forest
##
## Call:
## randomForest(formula = form, data = train, ntree = 200, mtry = 3)
## Type of random forest: classification
## Number of trees: 200
## No. of variables tried at each split: 3
##
## OOB estimate of error rate: 3.08%
## Confusion matrix:
## B M class.error
## B 285 5 0.0172
## M 9 156 0.0545
# Score the hold-out rows and cross-tabulate predicted against true diagnosis.
# NOTE(review): no `positive` class is passed, so sensitivity, specificity etc.
# are reported with "B" as the positive class (see the printed summary below).
# Confirm that this is intended rather than "M".
mod_forest_pred = predict(object = mod_forest, newdata = test)
confusionMatrix(data = mod_forest_pred, reference = test$diagnosis)
## Confusion Matrix and Statistics
##
## Reference
## Prediction B M
## B 63 4
## M 4 43
##
## Accuracy : 0.93
## 95% CI : (0.866, 0.969)
## No Information Rate : 0.588
## P-Value [Acc > NIR] : <0.0000000000000002
##
## Kappa : 0.855
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.940
## Specificity : 0.915
## Pos Pred Value : 0.940
## Neg Pred Value : 0.915
## Prevalence : 0.588
## Detection Rate : 0.553
## Detection Prevalence : 0.588
## Balanced Accuracy : 0.928
##
## 'Positive' Class : B
##
# Print the predictors ranked by mean decrease in Gini, most important first
arrange(
  rownames_to_column(as.data.frame(importance(mod_forest))),
  desc(MeanDecreaseGini)
)
## rowname MeanDecreaseGini
## 1 radius_worst 22.111
## 2 area_worst 20.220
## 3 perimeter_worst 19.960
## 4 concave.points_worst 18.188
## 5 concave.points_mean 17.133
## 6 perimeter_mean 14.923
## 7 area_se 14.415
## 8 radius_mean 11.973
## 9 concavity_mean 10.096
## 10 area_mean 9.850
## 11 concavity_worst 9.312
## 12 compactness_worst 4.426
## 13 radius_se 4.340
## 14 perimeter_se 4.002
## 15 texture_worst 3.245
## 16 symmetry_worst 3.142
## 17 compactness_mean 3.035
## 18 smoothness_worst 2.657
## 19 texture_mean 2.407
## 20 concavity_se 2.303
## 21 fractal_dimension_worst 1.814
## 22 concave.points_se 1.542
## 23 smoothness_mean 1.517
## 24 symmetry_se 1.251
## 25 compactness_se 1.219
## 26 texture_se 1.159
## 27 fractal_dimension_se 1.141
## 28 fractal_dimension_mean 0.971
## 29 smoothness_se 0.956
## 30 symmetry_mean 0.759
# Importance table (predictor name in `rowname`), sorted for plotting
var_importance = arrange(
  rownames_to_column(as.data.frame(importance(mod_forest))),
  desc(MeanDecreaseGini)
)
# Horizontal bar chart with one bar per predictor, ordered by importance.
# Fill only colours the bars; the legend is suppressed.
ggplot(
  var_importance,
  aes(x = reorder(rowname, MeanDecreaseGini), y = MeanDecreaseGini, fill = rowname)
) +
  geom_col() +
  labs(
    title = "Variable Importance from Random Forest Model",
    x = "Predictors",
    y = "Variable Importance (Mean Decrease in Gini Index)"
  ) +
  coord_flip() +
  theme(legend.position = "none")

##############################
# Read the caravan-insurance data. TRUE is spelled out because T is an ordinary
# variable that can be reassigned.
insurance = read.csv("caravan-insurance-challenge.csv", header = TRUE,
                     stringsAsFactors = TRUE)
# Preview the first six rows
head(insurance)
## id MOSTYPE MAANTHUI MGEMOMV MGEMLEEF MOSHOOFD MGODRK MGODPR MGODOV MGODGE
## 1 1 33 1 3 2 8 0 5 1 3
## 2 2 37 1 2 2 8 1 4 1 4
## 3 3 37 1 2 2 8 0 4 2 4
## 4 4 9 1 3 3 3 2 3 2 4
## 5 5 40 1 4 2 10 1 4 1 4
## 6 6 23 1 2 1 5 0 5 0 5
## MRELGE MRELSA MRELOV MFALLEEN MFGEKIND MFWEKIND MOPLHOOG MOPLMIDD MOPLLAAG
## 1 7 0 2 1 2 6 1 2 7
## 2 6 2 2 0 4 5 0 5 4
## 3 3 2 4 4 4 2 0 5 4
## 4 5 2 2 2 3 4 3 4 2
## 5 7 1 2 2 4 4 5 4 0
## 6 0 6 3 3 5 2 0 5 4
## MBERHOOG MBERZELF MBERBOER MBERMIDD MBERARBG MBERARBO MSKA MSKB1 MSKB2 MSKC
## 1 1 0 1 2 5 2 1 1 2 6
## 2 0 0 0 5 0 4 0 2 3 5
## 3 0 0 0 7 0 2 0 5 0 4
## 4 4 0 0 3 1 2 3 2 1 4
## 5 0 5 4 0 0 0 9 0 0 0
## 6 2 0 0 4 2 2 2 2 2 4
## MSKD MHHUUR MHKOOP MAUT1 MAUT2 MAUT0 MZFONDS MZPART MINKM30 MINK3045 MINK4575
## 1 1 1 8 8 0 1 8 1 0 4 5
## 2 0 2 7 7 1 2 6 3 2 0 5
## 3 0 7 2 7 0 2 9 0 4 5 0
## 4 0 5 4 9 0 0 7 2 1 5 3
## 5 0 4 5 6 2 1 5 4 0 0 9
## 6 2 9 0 5 3 3 9 0 5 2 3
## MINK7512 MINK123M MINKGEM MKOOPKLA PWAPART PWABEDR PWALAND PPERSAUT PBESAUT
## 1 0 0 4 3 0 0 0 6 0
## 2 2 0 5 4 2 0 0 0 0
## 3 0 0 3 4 2 0 0 6 0
## 4 0 0 4 4 0 0 0 6 0
## 5 0 0 6 3 0 0 0 0 0
## 6 0 0 3 3 0 0 0 6 0
## PMOTSCO PVRAAUT PAANHANG PTRACTOR PWERKT PBROM PLEVEN PPERSONG PGEZONG
## 1 0 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 0 0 0
## 4 0 0 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 0 0 0
## 6 0 0 0 0 0 0 0 0 0
## PWAOREG PBRAND PZEILPL PPLEZIER PFIETS PINBOED PBYSTAND AWAPART AWABEDR
## 1 0 5 0 0 0 0 0 0 0
## 2 0 2 0 0 0 0 0 2 0
## 3 0 2 0 0 0 0 0 1 0
## 4 0 2 0 0 0 0 0 0 0
## 5 0 6 0 0 0 0 0 0 0
## 6 0 0 0 0 0 0 0 0 0
## AWALAND APERSAUT ABESAUT AMOTSCO AVRAAUT AAANHANG ATRACTOR AWERKT ABROM
## 1 0 1 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0 0
## 3 0 1 0 0 0 0 0 0 0
## 4 0 1 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 0 0 0
## 6 0 1 0 0 0 0 0 0 0
## ALEVEN APERSONG AGEZONG AWAOREG ABRAND AZEILPL APLEZIER AFIETS AINBOED
## 1 0 0 0 0 1 0 0 0 0
## 2 0 0 0 0 1 0 0 0 0
## 3 0 0 0 0 1 0 0 0 0
## 4 0 0 0 0 1 0 0 0 0
## 5 0 0 0 0 1 0 0 0 0
## 6 0 0 0 0 0 0 0 0 0
## ABYSTAND CARAVAN
## 1 0 0
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 0
# Fix the RNG seed so the random 60/40 split is reproducible
set.seed(12345)
# Tag every row with its position; the tag lets the hold-out set be recovered
# as "all rows that were not sampled into train"
insurance_df = dplyr::mutate(insurance, ID = row_number())
train = sample_frac(insurance_df, 0.6)
test = anti_join(insurance_df, train, by = "ID")
# Drop the helper tag (ID) and the file's own identifier (id) from both subsets
train = select(train, -ID, -id)
test = select(test, -ID, -id)
#Establish Null Model
# List the columns that remain in the training set (predictors + CARAVAN)
names(train)
## [1] "MOSTYPE" "MAANTHUI" "MGEMOMV" "MGEMLEEF" "MOSHOOFD" "MGODRK"
## [7] "MGODPR" "MGODOV" "MGODGE" "MRELGE" "MRELSA" "MRELOV"
## [13] "MFALLEEN" "MFGEKIND" "MFWEKIND" "MOPLHOOG" "MOPLMIDD" "MOPLLAAG"
## [19] "MBERHOOG" "MBERZELF" "MBERBOER" "MBERMIDD" "MBERARBG" "MBERARBO"
## [25] "MSKA" "MSKB1" "MSKB2" "MSKC" "MSKD" "MHHUUR"
## [31] "MHKOOP" "MAUT1" "MAUT2" "MAUT0" "MZFONDS" "MZPART"
## [37] "MINKM30" "MINK3045" "MINK4575" "MINK7512" "MINK123M" "MINKGEM"
## [43] "MKOOPKLA" "PWAPART" "PWABEDR" "PWALAND" "PPERSAUT" "PBESAUT"
## [49] "PMOTSCO" "PVRAAUT" "PAANHANG" "PTRACTOR" "PWERKT" "PBROM"
## [55] "PLEVEN" "PPERSONG" "PGEZONG" "PWAOREG" "PBRAND" "PZEILPL"
## [61] "PPLEZIER" "PFIETS" "PINBOED" "PBYSTAND" "AWAPART" "AWABEDR"
## [67] "AWALAND" "APERSAUT" "ABESAUT" "AMOTSCO" "AVRAAUT" "AAANHANG"
## [73] "ATRACTOR" "AWERKT" "ABROM" "ALEVEN" "APERSONG" "AGEZONG"
## [79] "AWAOREG" "ABRAND" "AZEILPL" "APLEZIER" "AFIETS" "AINBOED"
## [85] "ABYSTAND" "CARAVAN"
# Null-model baseline: share of each CARAVAN class in the training set.
# Always predicting the larger class would be right at that class's share.
prop.table(table(train$CARAVAN))
##
## 0 1
## 0.9403 0.0597
#Model
# CARAVAN against every other column of `train`
form = as.formula("CARAVAN ~ .")
# Turn the 0/1 response into a factor so that a classification forest is grown
train[["CARAVAN"]] = as.factor(train[["CARAVAN"]])
# 200 trees, 5 candidate variables tried at each split
mod_forest = randomForest(formula = form, data = train, ntree = 200, mtry = 5)
# Show the fit summary (OOB error estimate and OOB confusion matrix)
print(mod_forest)
##
## Call:
## randomForest(formula = form, data = train, ntree = 200, mtry = 5)
## Type of random forest: classification
## Number of trees: 200
## No. of variables tried at each split: 5
##
## OOB estimate of error rate: 6.28%
## Confusion matrix:
## 0 1 class.error
## 0 5512 29 0.00523
## 1 341 11 0.96875
# Evaluate on the hold-out rows. Only train$CARAVAN was converted to a factor,
# so the test labels are converted here, using the same levels as the
# predictions; confusionMatrix() needs two factors with matching levels.
# Class "1" is the rare class, so it is reported as the positive class.
mod_forest_pred = predict(mod_forest, newdata = test)
confusionMatrix(mod_forest_pred,
                factor(test$CARAVAN, levels = levels(mod_forest_pred)),
                positive = "1")
# Print the predictors ranked by mean decrease in Gini, most important first
importance(mod_forest) %>% as.data.frame() %>%
  rownames_to_column() %>% arrange(desc(MeanDecreaseGini))
## rowname MeanDecreaseGini
## 1 PBRAND 17.220113
## 2 MOSTYPE 15.788165
## 3 PPERSAUT 14.478206
## 4 APERSAUT 13.193383
## 5 MKOOPKLA 12.441978
## 6 MBERMIDD 11.587130
## 7 MOSHOOFD 11.038368
## 8 MOPLLAAG 10.963097
## 9 MOPLMIDD 10.873277
## 10 MGODPR 10.435422
## 11 MFWEKIND 10.300690
## 12 MINK3045 10.236031
## 13 MBERARBG 9.874873
## 14 MGODGE 9.870187
## 15 MFGEKIND 9.788908
## 16 MBERARBO 9.537983
## 17 MSKC 9.498346
## 18 MINK4575 9.458944
## 19 MINKM30 9.401708
## 20 MHKOOP 9.380601
## 21 MSKB1 9.338888
## 22 MHHUUR 9.336127
## 23 MZPART 9.210257
## 24 PWAPART 9.192303
## 25 MBERHOOG 9.130984
## 26 MAUT1 9.079483
## 27 MINK7512 9.018692
## 28 MSKB2 8.888134
## 29 MOPLHOOG 8.810627
## 30 MRELGE 8.794985
## 31 MINKGEM 8.707201
## 32 MSKA 8.703573
## 33 MRELOV 8.571931
## 34 MZFONDS 8.556517
## 35 MAUT0 8.242629
## 36 MFALLEEN 8.173773
## 37 MGODOV 7.810161
## 38 MAUT2 7.685528
## 39 ABRAND 6.918440
## 40 MGODRK 6.619712
## 41 AWAPART 6.358411
## 42 MGEMLEEF 6.323388
## 43 MGEMOMV 6.041243
## 44 MSKD 6.029186
## 45 MBERZELF 5.669090
## 46 MRELSA 5.512110
## 47 ALEVEN 5.413133
## 48 APLEZIER 4.733942
## 49 PLEVEN 4.418790
## 50 MBERBOER 4.296431
## 51 PPLEZIER 3.748156
## 52 PBROM 3.423410
## 53 PGEZONG 3.318771
## 54 AFIETS 3.306825
## 55 AMOTSCO 2.993325
## 56 MINK123M 2.956958
## 57 ABROM 2.812934
## 58 PMOTSCO 2.727845
## 59 AGEZONG 2.503577
## 60 MAANTHUI 2.061797
## 61 PFIETS 2.049035
## 62 PTRACTOR 1.964685
## 63 PBYSTAND 1.821742
## 64 ABYSTAND 1.653674
## 65 PWABEDR 1.393949
## 66 AWAOREG 1.225636
## 67 AWABEDR 1.155639
## 68 ATRACTOR 1.111624
## 69 AINBOED 1.098447
## 70 PINBOED 1.066826
## 71 PWAOREG 1.048101
## 72 AAANHANG 0.838060
## 73 ABESAUT 0.833928
## 74 PAANHANG 0.774200
## 75 PBESAUT 0.675029
## 76 PPERSONG 0.538370
## 77 APERSONG 0.520351
## 78 PWALAND 0.422764
## 79 AWALAND 0.332298
## 80 AZEILPL 0.197461
## 81 PZEILPL 0.128503
## 82 PWERKT 0.031075
## 83 AWERKT 0.020037
## 84 PVRAAUT 0.000391
## 85 AVRAAUT 0.000226
# Importance table (predictor name in `rowname`), sorted for plotting
var_importance = arrange(
  rownames_to_column(as.data.frame(importance(mod_forest))),
  desc(MeanDecreaseGini)
)
# Horizontal bar chart with one bar per predictor, ordered by importance.
# Fill only colours the bars; the legend is suppressed.
ggplot(
  var_importance,
  aes(x = reorder(rowname, MeanDecreaseGini), y = MeanDecreaseGini, fill = rowname)
) +
  geom_col() +
  labs(
    title = "Variable Importance from Random Forest Model",
    x = "Predictors",
    y = "Variable Importance (Mean Decrease in Gini Index)"
  ) +
  coord_flip() +
  theme(legend.position = "none")

#=========================================================================================
#Testing on credit:
# Read the bank data. TRUE is spelled out because T is an ordinary variable
# that can be reassigned.
credit = read.csv("UniversalBank.csv", header = TRUE,
                  stringsAsFactors = TRUE)
# Preview the first six rows
head(credit)
## ID Age Experience Income ZIP.Code Family CCAvg Education Mortgage
## 1 1 25 1 49 91107 4 1.6 1 0
## 2 2 45 19 34 90089 3 1.5 1 0
## 3 3 39 15 11 94720 1 1.0 1 0
## 4 4 35 9 100 94112 1 2.7 2 0
## 5 5 35 8 45 91330 4 1.0 2 0
## 6 6 37 13 29 92121 4 0.4 2 155
## PersonalLoan SecuritiesAccount CDAccount Online CreditCard
## 1 0 1 0 0 0
## 2 0 1 0 0 0
## 3 0 0 0 0 0
## 4 0 0 0 0 0
## 5 0 0 0 0 1
## 6 0 0 0 1 0
# Fix the RNG seed so the random 80/20 split is reproducible
set.seed(12345)
# Set ID to the row position (this replaces the ID column read from the file);
# it is the key used to recover the hold-out rows by anti-join
credit_df = dplyr::mutate(credit, ID = row_number())
train = sample_frac(credit_df, 0.8)
test = anti_join(credit_df, train, by = "ID")
# ID has served its purpose; remove it from both subsets
train = select(train, -ID)
test = select(test, -ID)
#Establish Null Model
# List the columns that remain in the training set (predictors + PersonalLoan)
names(train)
## [1] "Age" "Experience" "Income"
## [4] "ZIP.Code" "Family" "CCAvg"
## [7] "Education" "Mortgage" "PersonalLoan"
## [10] "SecuritiesAccount" "CDAccount" "Online"
## [13] "CreditCard"
# Null-model baseline: share of each PersonalLoan class in the training set.
# Always predicting the larger class would be right at that class's share.
prop.table(table(train$PersonalLoan))
##
## 0 1
## 0.9032 0.0968
#Model
# PersonalLoan against the twelve predictors, spelled out explicitly
form = as.formula("PersonalLoan ~ Age + Experience + Income + ZIP.Code + Family + CCAvg +
Education + Mortgage + CreditCard + SecuritiesAccount + CDAccount + Online")
# Turn the 0/1 response into a factor so that a classification forest is grown
train[["PersonalLoan"]] = as.factor(train[["PersonalLoan"]])
# 200 trees, 3 candidate variables tried at each split
mod_forest = randomForest(formula = form, data = train, ntree = 200, mtry = 3)
# Show the fit summary (OOB error estimate and OOB confusion matrix)
print(mod_forest)
##
## Call:
## randomForest(formula = form, data = train, ntree = 200, mtry = 3)
## Type of random forest: classification
## Number of trees: 200
## No. of variables tried at each split: 3
##
## OOB estimate of error rate: 1.3%
## Confusion matrix:
## 0 1 class.error
## 0 3605 8 0.00221
## 1 44 343 0.11370
# Predict the loan class for every hold-out row, then print the full vector
# of predictions (one entry per test row)
mod_forest_pred = predict(object = mod_forest, newdata = test)
print(mod_forest_pred)
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
## 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
## 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64
## 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
## 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80
## 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0
## 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96
## 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
## 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112
## 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0
## 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128
## 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
## 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144
## 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
## 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160
## 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0
## 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176
## 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1
## 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192
## 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0
## 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208
## 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
## 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240
## 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0
## 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272
## 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0
## 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288
## 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0
## 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304
## 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0
## 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
## 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352
## 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
## 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368
## 1 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0
## 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384
## 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
## 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400
## 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
## 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416
## 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0
## 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432
## 1 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0
## 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448
## 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0
## 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464
## 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
## 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480
## 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
## 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496
## 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0
## 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512
## 1 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0
## 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528
## 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0
## 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544
## 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0
## 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560
## 1 0 1 0 0 1 0 0 0 0 1 1 0 0 0 0
## 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576
## 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
## 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592
## 0 0 0 0 1 0 1 0 0 0 0 0 1 0 0 0
## 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608
## 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
## 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656
## 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0
## 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688
## 0 1 0 1 1 0 0 0 0 0 1 0 0 0 1 0
## 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704
## 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0
## 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720
## 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
## 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752
## 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
## 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
## 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800
## 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0
## 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816
## 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1
## 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832
## 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
## 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
## 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880
## 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912
## 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 1
## 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928
## 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
## 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976
## 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0
## 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992
## 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
## 993 994 995 996 997 998 999 1000
## 0 1 0 0 0 0 0 0
## Levels: 0 1
# Evaluate the hold-out predictions. Only train$PersonalLoan was converted to a
# factor, so the test labels are converted here, using the same levels as the
# predictions; confusionMatrix() needs two factors with matching levels.
# Class "1" is the rare class, so it is reported as the positive class.
confusionMatrix(mod_forest_pred,
                factor(test$PersonalLoan, levels = levels(mod_forest_pred)),
                positive = "1")
# Print the predictors ranked by mean decrease in Gini, most important first
importance(mod_forest) %>% as.data.frame() %>%
  rownames_to_column() %>% arrange(desc(MeanDecreaseGini))
## rowname MeanDecreaseGini
## 1 Income 233.71
## 2 Education 139.92
## 3 CCAvg 109.90
## 4 Family 71.31
## 5 CDAccount 37.91
## 6 Age 23.49
## 7 ZIP.Code 21.80
## 8 Experience 21.71
## 9 Mortgage 21.03
## 10 CreditCard 5.44
## 11 Online 4.49
## 12 SecuritiesAccount 2.83
# Importance table (predictor name in `rowname`), sorted for plotting
var_importance = arrange(
  rownames_to_column(as.data.frame(importance(mod_forest))),
  desc(MeanDecreaseGini)
)
# Horizontal bar chart with one bar per predictor, ordered by importance.
# Fill only colours the bars; the legend is suppressed.
ggplot(
  var_importance,
  aes(x = reorder(rowname, MeanDecreaseGini), y = MeanDecreaseGini, fill = rowname)
) +
  geom_col() +
  labs(
    title = "Variable Importance from Random Forest Model",
    x = "Predictors",
    y = "Variable Importance (Mean Decrease in Gini Index)"
  ) +
  coord_flip() +
  theme(legend.position = "none")

# Read the new, unscored applicants. FALSE is spelled out because F is an
# ordinary variable that can be reassigned.
newcredit = read.csv("newCreditApplicants.csv", header = TRUE, stringsAsFactors = FALSE)
# Show the column types of the new data
str(newcredit)
## 'data.frame': 20 obs. of 13 variables:
## $ ID : int 3280 309 1544 3076 1989 3751 2957 1586 4053 1604 ...
## $ Age : int 26 32 52 26 52 57 62 57 43 36 ...
## $ Experience : int -1 8 26 0 28 32 38 31 19 6 ...
## $ Income : int 44 128 101 85 18 52 195 131 54 138 ...
## $ ZIP.Code : int 94901 94720 93407 95616 91301 90266 91125 90502 94608 92152 ...
## $ Family : int 1 2 2 2 1 3 4 2 2 1 ...
## $ CCAvg : num 2 4.33 2.4 1.6 0.3 0.5 5.2 2.7 1.7 7 ...
## $ Education : int 2 1 2 3 1 2 3 1 1 3 ...
## $ Mortgage : int 0 0 0 0 120 0 522 0 0 86 ...
## $ SecuritiesAccount: int 0 0 0 0 0 0 0 0 0 0 ...
## $ CDAccount : int 0 1 0 0 0 0 1 0 0 0 ...
## $ Online : int 0 1 1 0 1 1 1 0 1 1 ...
## $ CreditCard : int 0 1 0 0 0 0 1 0 0 0 ...
# Score the new applicants with the credit forest. The ID column read from the
# file is left untouched: no join is performed here and ID is not a term in the
# model formula, so renumbering it would only discard the applicant IDs.
newcredit_df = newcredit
mod_forest_pred2 = predict(mod_forest, newdata = newcredit_df)
# Print the predicted class for each applicant (named by row position)
mod_forest_pred2
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
## 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0
## Levels: 0 1
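# No confusion matrix for the new applicants: the file has no PersonalLoan column
# (see the str() output above) and `test` holds different rows, so the call
# below cannot be used to score mod_forest_pred2.
#confusionMatrix(mod_forest_pred2, test$PersonalLoan)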