# Random_Forests

#RANDOM FORESTS

#Decision trees try to find the optimal rule to forecast an outcome based on a sequence of simple decision steps.

      # Session setup for the whole script.
      # NOTE(review): clearing the workspace and hard-coding a working directory
      # make the script machine-specific; kept because the CSV reads further down
      # use paths relative to this directory.
      rm(list = ls())
      setwd("E:/R")
      # Print 3 significant digits and strongly prefer fixed over scientific notation.
      options(scipen = 9999, digits = 3)

      # Ver. "2021-01-29"
      # JWatt Notes:

  #Random forests average the output of many decision trees. Each decision tree is fit on a
  #bootstrap sample of the training examples, and each split is constrained to consider only a small random subset of the
  #input features (the mtry argument below). Averaging the output of these trees reduces the variance of the overall estimator.
  
  #To fit a decision tree, the algorithm usually looks for the best variable and the best splitting value among all possibilities, so
  #that a particular loss function is minimized. The loss function can be defined as the impurities in the child nodes, which are
  #measured by a Gini index or entropy. Criteria can be used to ensure the tree is interpretable and prevent overfitting, e.g.
  # . Max depth: deciding a maximum depth of the tree
  # . Node size: at least N observations in each node
  
  #One can also build a large tree with many branches, and then prune the tree by combining subtrees with the lowest trade-off
  #in the goodness of fit. Apart from classifications, decision trees can also be used for regressions that predict a continuous
  #outcome. In that case, the model is simply a piecewise constant "surface" depending on the thresholds of the explanatory
  #variables.

          library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0      ✔ purrr   0.3.5 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.5.0 
## ✔ readr   2.1.3      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

          #special libraries
          library(mdsr)
          library(rpart)
          library(partykit)

## Loading required package: grid
## Loading required package: libcoin
## Loading required package: mvtnorm

          library(caret)

## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift

          library(randomForest)

## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## 
## The following object is masked from 'package:dplyr':
## 
##     combine
## 
## The following object is masked from 'package:ggplot2':
## 
##     margin

          library(DescTools)

## 
## Attaching package: 'DescTools'
## 
## The following objects are masked from 'package:caret':
## 
##     MAE, RMSE

          ###########################################################################################
          # Read the dataset
          # Load the breast-cancer data set from the working directory.
          # TRUE is spelled out because T is an ordinary variable that can be
          # reassigned, whereas TRUE is a reserved constant.
          # stringsAsFactors = TRUE turns character columns (e.g. diagnosis) into factors.
          cancer <- read.csv(
            "wisc_bc_data.csv",
            header = TRUE,
            stringsAsFactors = TRUE
          )
          # Preview the first rows to check that the file was parsed as expected.
          head(cancer)

##         id diagnosis radius_mean texture_mean perimeter_mean area_mean
## 1   842302         M        18.0         10.4          122.8      1001
## 2   842517         M        20.6         17.8          132.9      1326
## 3 84300903         M        19.7         21.2          130.0      1203
## 4 84348301         M        11.4         20.4           77.6       386
## 5 84358402         M        20.3         14.3          135.1      1297
## 6   843786         M        12.4         15.7           82.6       477
##   smoothness_mean compactness_mean concavity_mean concave.points_mean
## 1          0.1184           0.2776         0.3001              0.1471
## 2          0.0847           0.0786         0.0869              0.0702
## 3          0.1096           0.1599         0.1974              0.1279
## 4          0.1425           0.2839         0.2414              0.1052
## 5          0.1003           0.1328         0.1980              0.1043
## 6          0.1278           0.1700         0.1578              0.0809
##   symmetry_mean fractal_dimension_mean radius_se texture_se perimeter_se
## 1         0.242                 0.0787     1.095      0.905         8.59
## 2         0.181                 0.0567     0.543      0.734         3.40
## 3         0.207                 0.0600     0.746      0.787         4.58
## 4         0.260                 0.0974     0.496      1.156         3.44
## 5         0.181                 0.0588     0.757      0.781         5.44
## 6         0.209                 0.0761     0.335      0.890         2.22
##   area_se smoothness_se compactness_se concavity_se concave.points_se
## 1   153.4       0.00640         0.0490       0.0537            0.0159
## 2    74.1       0.00522         0.0131       0.0186            0.0134
## 3    94.0       0.00615         0.0401       0.0383            0.0206
## 4    27.2       0.00911         0.0746       0.0566            0.0187
## 5    94.4       0.01149         0.0246       0.0569            0.0188
## 6    27.2       0.00751         0.0335       0.0367            0.0114
##   symmetry_se fractal_dimension_se radius_worst texture_worst perimeter_worst
## 1      0.0300              0.00619         25.4          17.3           184.6
## 2      0.0139              0.00353         25.0          23.4           158.8
## 3      0.0225              0.00457         23.6          25.5           152.5
## 4      0.0596              0.00921         14.9          26.5            98.9
## 5      0.0176              0.00511         22.5          16.7           152.2
## 6      0.0216              0.00508         15.5          23.8           103.4
##   area_worst smoothness_worst compactness_worst concavity_worst
## 1       2019            0.162             0.666           0.712
## 2       1956            0.124             0.187           0.242
## 3       1709            0.144             0.424           0.450
## 4        568            0.210             0.866           0.687
## 5       1575            0.137             0.205           0.400
## 6        742            0.179             0.525           0.535
##   concave.points_worst symmetry_worst fractal_dimension_worst
## 1                0.265          0.460                  0.1189
## 2                0.186          0.275                  0.0890
## 3                0.243          0.361                  0.0876
## 4                0.258          0.664                  0.1730
## 5                0.163          0.236                  0.0768
## 6                0.174          0.399                  0.1244

          # Fix the RNG so the random split (and the forest grown later) is repeatable.
          set.seed(12345)

          # Tag each row with a temporary key, draw 80% of rows for training and
          # keep the rows whose key was not drawn as the test set.
          cancer_df <- dplyr::mutate(cancer, ID = row_number())
          train <- sample_frac(cancer_df, 0.8)
          test <- anti_join(cancer_df, train, by = "ID")

          # Drop the temporary key and the original id column so that neither
          # is used as a predictor.
          train <- select(train, -ID, -id)
          test <- select(test, -ID, -id)

          # Columns available for modelling (the null-model baseline follows).
          names(train)

##  [1] "diagnosis"               "radius_mean"            
##  [3] "texture_mean"            "perimeter_mean"         
##  [5] "area_mean"               "smoothness_mean"        
##  [7] "compactness_mean"        "concavity_mean"         
##  [9] "concave.points_mean"     "symmetry_mean"          
## [11] "fractal_dimension_mean"  "radius_se"              
## [13] "texture_se"              "perimeter_se"           
## [15] "area_se"                 "smoothness_se"          
## [17] "compactness_se"          "concavity_se"           
## [19] "concave.points_se"       "symmetry_se"            
## [21] "fractal_dimension_se"    "radius_worst"           
## [23] "texture_worst"           "perimeter_worst"        
## [25] "area_worst"              "smoothness_worst"       
## [27] "compactness_worst"       "concavity_worst"        
## [29] "concave.points_worst"    "symmetry_worst"         
## [31] "fractal_dimension_worst"

          # Null model: class shares in the training set, i.e. the accuracy of
          # always predicting the majority class.
          prop.table(table(train[["diagnosis"]]))

## 
##     B     M 
## 0.637 0.363

                #Model
                form = as.formula("diagnosis ~ radius_mean + texture_mean + perimeter_mean + area_mean + smoothness_mean + 
                      compactness_mean +concavity_mean+concave.points_mean + symmetry_mean+ fractal_dimension_mean")
                
                form = as.formula("diagnosis ~ .")
                
                train$diagnosis = as.factor(train$diagnosis)
                mod_forest = randomForest(form,data = train, ntree = 200, mtry = 3)
                #mod_forest = randomForest(diagnosis~ radius_mean,data = train, ntree = 200, mtry = 3)
                mod_forest

## 
## Call:
##  randomForest(formula = form, data = train, ntree = 200, mtry = 3) 
##                Type of random forest: classification
##                      Number of trees: 200
## No. of variables tried at each split: 3
## 
##         OOB estimate of  error rate: 3.08%
## Confusion matrix:
##     B   M class.error
## B 285   5      0.0172
## M   9 156      0.0545

                # Score the held-out rows with the fitted forest.
                mod_forest_pred <- predict(mod_forest, newdata = test)
                # caret::confusionMatrix() uses the first factor level ("B") as the positive
                # class unless told otherwise. "M" (presumably malignant) is the class of
                # interest, so sensitivity/specificity are reported for "M" explicitly.
                # NOTE: the recorded output below was produced with the default ("B"); its
                # class-specific rows (Sensitivity/Specificity, PPV/NPV, Prevalence, Detection
                # Rate/Prevalence) are swapped relative to this call. Accuracy, Kappa and the
                # table itself are unaffected.
                confusionMatrix(mod_forest_pred, test$diagnosis, positive = "M")

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  B  M
##          B 63  4
##          M  4 43
##                                              
##                Accuracy : 0.93               
##                  95% CI : (0.866, 0.969)     
##     No Information Rate : 0.588              
##     P-Value [Acc > NIR] : <0.0000000000000002
##                                              
##                   Kappa : 0.855              
##                                              
##  Mcnemar's Test P-Value : 1                  
##                                              
##             Sensitivity : 0.940              
##             Specificity : 0.915              
##          Pos Pred Value : 0.940              
##          Neg Pred Value : 0.915              
##              Prevalence : 0.588              
##          Detection Rate : 0.553              
##    Detection Prevalence : 0.588              
##       Balanced Accuracy : 0.928              
##                                              
##        'Positive' Class : B                  
##

                # Print the Gini-based variable importance, most important predictor first.
                # rownames_to_column() moves the predictor names into a "rowname" column.
                arrange(
                  rownames_to_column(as.data.frame(importance(mod_forest))),
                  desc(MeanDecreaseGini)
                )

##                    rowname MeanDecreaseGini
## 1             radius_worst           22.111
## 2               area_worst           20.220
## 3          perimeter_worst           19.960
## 4     concave.points_worst           18.188
## 5      concave.points_mean           17.133
## 6           perimeter_mean           14.923
## 7                  area_se           14.415
## 8              radius_mean           11.973
## 9           concavity_mean           10.096
## 10               area_mean            9.850
## 11         concavity_worst            9.312
## 12       compactness_worst            4.426
## 13               radius_se            4.340
## 14            perimeter_se            4.002
## 15           texture_worst            3.245
## 16          symmetry_worst            3.142
## 17        compactness_mean            3.035
## 18        smoothness_worst            2.657
## 19            texture_mean            2.407
## 20            concavity_se            2.303
## 21 fractal_dimension_worst            1.814
## 22       concave.points_se            1.542
## 23         smoothness_mean            1.517
## 24             symmetry_se            1.251
## 25          compactness_se            1.219
## 26              texture_se            1.159
## 27    fractal_dimension_se            1.141
## 28  fractal_dimension_mean            0.971
## 29           smoothness_se            0.956
## 30           symmetry_mean            0.759

                # Importance table sorted from most to least important predictor;
                # predictor names end up in the "rowname" column.
                var_importance <- as.data.frame(importance(mod_forest))
                var_importance <- rownames_to_column(var_importance)
                var_importance <- arrange(var_importance, desc(MeanDecreaseGini))

      # Horizontal bar chart of the importance scores. reorder() sorts the bars by
      # score; the fill legend is hidden because it would only repeat the axis labels.
      ggplot(
        var_importance,
        aes(x = reorder(rowname, MeanDecreaseGini), y = MeanDecreaseGini, fill = rowname)
      ) +
        geom_bar(stat = "identity") +
        labs(
          title = "Variable Importance from Random Forest Model",
          x = "Predictors",
          y = "Variable Importance (Mean Decrease in Gini Index)"
        ) +
        # scale_fill_discrete(name = "Predictor") +
        coord_flip() +
        theme(legend.position = "none")

      ##############################
      
      # Load the caravan insurance data set from the working directory.
      # TRUE is spelled out because T is an ordinary, reassignable variable.
      insurance <- read.csv(
        "caravan-insurance-challenge.csv",
        header = TRUE,
        stringsAsFactors = TRUE
      )
      # Preview the first rows to check that the file was parsed as expected.
      head(insurance)

##   id MOSTYPE MAANTHUI MGEMOMV MGEMLEEF MOSHOOFD MGODRK MGODPR MGODOV MGODGE
## 1  1      33        1       3        2        8      0      5      1      3
## 2  2      37        1       2        2        8      1      4      1      4
## 3  3      37        1       2        2        8      0      4      2      4
## 4  4       9        1       3        3        3      2      3      2      4
## 5  5      40        1       4        2       10      1      4      1      4
## 6  6      23        1       2        1        5      0      5      0      5
##   MRELGE MRELSA MRELOV MFALLEEN MFGEKIND MFWEKIND MOPLHOOG MOPLMIDD MOPLLAAG
## 1      7      0      2        1        2        6        1        2        7
## 2      6      2      2        0        4        5        0        5        4
## 3      3      2      4        4        4        2        0        5        4
## 4      5      2      2        2        3        4        3        4        2
## 5      7      1      2        2        4        4        5        4        0
## 6      0      6      3        3        5        2        0        5        4
##   MBERHOOG MBERZELF MBERBOER MBERMIDD MBERARBG MBERARBO MSKA MSKB1 MSKB2 MSKC
## 1        1        0        1        2        5        2    1     1     2    6
## 2        0        0        0        5        0        4    0     2     3    5
## 3        0        0        0        7        0        2    0     5     0    4
## 4        4        0        0        3        1        2    3     2     1    4
## 5        0        5        4        0        0        0    9     0     0    0
## 6        2        0        0        4        2        2    2     2     2    4
##   MSKD MHHUUR MHKOOP MAUT1 MAUT2 MAUT0 MZFONDS MZPART MINKM30 MINK3045 MINK4575
## 1    1      1      8     8     0     1       8      1       0        4        5
## 2    0      2      7     7     1     2       6      3       2        0        5
## 3    0      7      2     7     0     2       9      0       4        5        0
## 4    0      5      4     9     0     0       7      2       1        5        3
## 5    0      4      5     6     2     1       5      4       0        0        9
## 6    2      9      0     5     3     3       9      0       5        2        3
##   MINK7512 MINK123M MINKGEM MKOOPKLA PWAPART PWABEDR PWALAND PPERSAUT PBESAUT
## 1        0        0       4        3       0       0       0        6       0
## 2        2        0       5        4       2       0       0        0       0
## 3        0        0       3        4       2       0       0        6       0
## 4        0        0       4        4       0       0       0        6       0
## 5        0        0       6        3       0       0       0        0       0
## 6        0        0       3        3       0       0       0        6       0
##   PMOTSCO PVRAAUT PAANHANG PTRACTOR PWERKT PBROM PLEVEN PPERSONG PGEZONG
## 1       0       0        0        0      0     0      0        0       0
## 2       0       0        0        0      0     0      0        0       0
## 3       0       0        0        0      0     0      0        0       0
## 4       0       0        0        0      0     0      0        0       0
## 5       0       0        0        0      0     0      0        0       0
## 6       0       0        0        0      0     0      0        0       0
##   PWAOREG PBRAND PZEILPL PPLEZIER PFIETS PINBOED PBYSTAND AWAPART AWABEDR
## 1       0      5       0        0      0       0        0       0       0
## 2       0      2       0        0      0       0        0       2       0
## 3       0      2       0        0      0       0        0       1       0
## 4       0      2       0        0      0       0        0       0       0
## 5       0      6       0        0      0       0        0       0       0
## 6       0      0       0        0      0       0        0       0       0
##   AWALAND APERSAUT ABESAUT AMOTSCO AVRAAUT AAANHANG ATRACTOR AWERKT ABROM
## 1       0        1       0       0       0        0        0      0     0
## 2       0        0       0       0       0        0        0      0     0
## 3       0        1       0       0       0        0        0      0     0
## 4       0        1       0       0       0        0        0      0     0
## 5       0        0       0       0       0        0        0      0     0
## 6       0        1       0       0       0        0        0      0     0
##   ALEVEN APERSONG AGEZONG AWAOREG ABRAND AZEILPL APLEZIER AFIETS AINBOED
## 1      0        0       0       0      1       0        0      0       0
## 2      0        0       0       0      1       0        0      0       0
## 3      0        0       0       0      1       0        0      0       0
## 4      0        0       0       0      1       0        0      0       0
## 5      0        0       0       0      1       0        0      0       0
## 6      0        0       0       0      0       0        0      0       0
##   ABYSTAND CARAVAN
## 1        0       0
## 2        0       0
## 3        0       0
## 4        0       0
## 5        0       0
## 6        0       0

      # Fix the RNG so the random split (and the forest grown later) is repeatable.
      set.seed(12345)

      # Tag each row with a temporary key, draw 60% of rows for training and
      # keep the rows whose key was not drawn as the test set.
      insurance_df <- dplyr::mutate(insurance, ID = row_number())
      train <- sample_frac(insurance_df, 0.6)
      test <- anti_join(insurance_df, train, by = "ID")

      # Drop the temporary key and the original id column so that neither
      # is used as a predictor.
      train <- select(train, -ID, -id)
      test <- select(test, -ID, -id)

      # Columns available for modelling (the null-model baseline follows).
      names(train)

##  [1] "MOSTYPE"  "MAANTHUI" "MGEMOMV"  "MGEMLEEF" "MOSHOOFD" "MGODRK"  
##  [7] "MGODPR"   "MGODOV"   "MGODGE"   "MRELGE"   "MRELSA"   "MRELOV"  
## [13] "MFALLEEN" "MFGEKIND" "MFWEKIND" "MOPLHOOG" "MOPLMIDD" "MOPLLAAG"
## [19] "MBERHOOG" "MBERZELF" "MBERBOER" "MBERMIDD" "MBERARBG" "MBERARBO"
## [25] "MSKA"     "MSKB1"    "MSKB2"    "MSKC"     "MSKD"     "MHHUUR"  
## [31] "MHKOOP"   "MAUT1"    "MAUT2"    "MAUT0"    "MZFONDS"  "MZPART"  
## [37] "MINKM30"  "MINK3045" "MINK4575" "MINK7512" "MINK123M" "MINKGEM" 
## [43] "MKOOPKLA" "PWAPART"  "PWABEDR"  "PWALAND"  "PPERSAUT" "PBESAUT" 
## [49] "PMOTSCO"  "PVRAAUT"  "PAANHANG" "PTRACTOR" "PWERKT"   "PBROM"   
## [55] "PLEVEN"   "PPERSONG" "PGEZONG"  "PWAOREG"  "PBRAND"   "PZEILPL" 
## [61] "PPLEZIER" "PFIETS"   "PINBOED"  "PBYSTAND" "AWAPART"  "AWABEDR" 
## [67] "AWALAND"  "APERSAUT" "ABESAUT"  "AMOTSCO"  "AVRAAUT"  "AAANHANG"
## [73] "ATRACTOR" "AWERKT"   "ABROM"    "ALEVEN"   "APERSONG" "AGEZONG" 
## [79] "AWAOREG"  "ABRAND"   "AZEILPL"  "APLEZIER" "AFIETS"   "AINBOED" 
## [85] "ABYSTAND" "CARAVAN"

      # Null model: class shares of CARAVAN in the training set, i.e. the accuracy
      # of always predicting the majority class.
      prop.table(table(train[["CARAVAN"]]))

## 
##      0      1 
## 0.9403 0.0597

      #Model
      form = as.formula("CARAVAN ~ .")
      
      train$CARAVAN = as.factor(train$CARAVAN)
      mod_forest = randomForest(form,data = train, ntree = 200, mtry = 5)
      mod_forest

## 
## Call:
##  randomForest(formula = form, data = train, ntree = 200, mtry = 5) 
##                Type of random forest: classification
##                      Number of trees: 200
## No. of variables tried at each split: 5
## 
##         OOB estimate of  error rate: 6.28%
## Confusion matrix:
##      0  1 class.error
## 0 5512 29     0.00523
## 1  341 11     0.96875

      # Held-out evaluation (previously commented out; the recorded output in this
      # file predates this step). Only train$CARAVAN was converted to a factor, so
      # the numeric test labels are wrapped in factor() with the training levels:
      # caret::confusionMatrix() requires factors with matching levels.
      # "1" is the minority class (about 6% of training rows), so it is reported as
      # the positive class rather than the default first level "0".
      mod_forest_pred <- predict(mod_forest, newdata = test)
      confusionMatrix(
        mod_forest_pred,
        factor(test$CARAVAN, levels = levels(train$CARAVAN)),
        positive = "1"
      )

      # Print the Gini-based variable importance, most important predictor first.
      importance(mod_forest) %>%
        as.data.frame() %>%
        rownames_to_column() %>%
        arrange(desc(MeanDecreaseGini))

##     rowname MeanDecreaseGini
## 1    PBRAND        17.220113
## 2   MOSTYPE        15.788165
## 3  PPERSAUT        14.478206
## 4  APERSAUT        13.193383
## 5  MKOOPKLA        12.441978
## 6  MBERMIDD        11.587130
## 7  MOSHOOFD        11.038368
## 8  MOPLLAAG        10.963097
## 9  MOPLMIDD        10.873277
## 10   MGODPR        10.435422
## 11 MFWEKIND        10.300690
## 12 MINK3045        10.236031
## 13 MBERARBG         9.874873
## 14   MGODGE         9.870187
## 15 MFGEKIND         9.788908
## 16 MBERARBO         9.537983
## 17     MSKC         9.498346
## 18 MINK4575         9.458944
## 19  MINKM30         9.401708
## 20   MHKOOP         9.380601
## 21    MSKB1         9.338888
## 22   MHHUUR         9.336127
## 23   MZPART         9.210257
## 24  PWAPART         9.192303
## 25 MBERHOOG         9.130984
## 26    MAUT1         9.079483
## 27 MINK7512         9.018692
## 28    MSKB2         8.888134
## 29 MOPLHOOG         8.810627
## 30   MRELGE         8.794985
## 31  MINKGEM         8.707201
## 32     MSKA         8.703573
## 33   MRELOV         8.571931
## 34  MZFONDS         8.556517
## 35    MAUT0         8.242629
## 36 MFALLEEN         8.173773
## 37   MGODOV         7.810161
## 38    MAUT2         7.685528
## 39   ABRAND         6.918440
## 40   MGODRK         6.619712
## 41  AWAPART         6.358411
## 42 MGEMLEEF         6.323388
## 43  MGEMOMV         6.041243
## 44     MSKD         6.029186
## 45 MBERZELF         5.669090
## 46   MRELSA         5.512110
## 47   ALEVEN         5.413133
## 48 APLEZIER         4.733942
## 49   PLEVEN         4.418790
## 50 MBERBOER         4.296431
## 51 PPLEZIER         3.748156
## 52    PBROM         3.423410
## 53  PGEZONG         3.318771
## 54   AFIETS         3.306825
## 55  AMOTSCO         2.993325
## 56 MINK123M         2.956958
## 57    ABROM         2.812934
## 58  PMOTSCO         2.727845
## 59  AGEZONG         2.503577
## 60 MAANTHUI         2.061797
## 61   PFIETS         2.049035
## 62 PTRACTOR         1.964685
## 63 PBYSTAND         1.821742
## 64 ABYSTAND         1.653674
## 65  PWABEDR         1.393949
## 66  AWAOREG         1.225636
## 67  AWABEDR         1.155639
## 68 ATRACTOR         1.111624
## 69  AINBOED         1.098447
## 70  PINBOED         1.066826
## 71  PWAOREG         1.048101
## 72 AAANHANG         0.838060
## 73  ABESAUT         0.833928
## 74 PAANHANG         0.774200
## 75  PBESAUT         0.675029
## 76 PPERSONG         0.538370
## 77 APERSONG         0.520351
## 78  PWALAND         0.422764
## 79  AWALAND         0.332298
## 80  AZEILPL         0.197461
## 81  PZEILPL         0.128503
## 82   PWERKT         0.031075
## 83   AWERKT         0.020037
## 84  PVRAAUT         0.000391
## 85  AVRAAUT         0.000226

      # Importance table sorted from most to least important predictor;
      # predictor names end up in the "rowname" column.
      var_importance <- arrange(
        rownames_to_column(as.data.frame(importance(mod_forest))),
        desc(MeanDecreaseGini)
      )

      # Horizontal bar chart of the importance scores. reorder() sorts the bars by
      # score; the fill legend is hidden because it would only repeat the axis labels.
      ggplot(
        var_importance,
        aes(x = reorder(rowname, MeanDecreaseGini), y = MeanDecreaseGini, fill = rowname)
      ) +
        geom_bar(stat = "identity") +
        labs(
          title = "Variable Importance from Random Forest Model",
          x = "Predictors",
          y = "Variable Importance (Mean Decrease in Gini Index)"
        ) +
        # scale_fill_discrete(name = "Predictor") +
        coord_flip() +
        theme(legend.position = "none")

          #=========================================================================================
                
      #Testing on credit:
      
      # Load the Universal Bank data set from the working directory.
      # TRUE is spelled out because T is an ordinary, reassignable variable.
      credit <- read.csv(
        "UniversalBank.csv",
        header = TRUE,
        stringsAsFactors = TRUE
      )
      # Preview the first rows to check that the file was parsed as expected.
      head(credit)

##   ID Age Experience Income ZIP.Code Family CCAvg Education Mortgage
## 1  1  25          1     49    91107      4   1.6         1        0
## 2  2  45         19     34    90089      3   1.5         1        0
## 3  3  39         15     11    94720      1   1.0         1        0
## 4  4  35          9    100    94112      1   2.7         2        0
## 5  5  35          8     45    91330      4   1.0         2        0
## 6  6  37         13     29    92121      4   0.4         2      155
##   PersonalLoan SecuritiesAccount CDAccount Online CreditCard
## 1            0                 1         0      0          0
## 2            0                 1         0      0          0
## 3            0                 0         0      0          0
## 4            0                 0         0      0          0
## 5            0                 0         0      0          1
## 6            0                 0         0      1          0

      # Fix the RNG so the random split (and the forest grown later) is repeatable.
      set.seed(12345)

      # Key each row by its row number, draw 80% of rows for training and keep the
      # rows whose key was not drawn as the test set.
      # Note: this file already has an ID column; mutate() overwrites it with the
      # row number.
      credit_df <- dplyr::mutate(credit, ID = row_number())
      train <- sample_frac(credit_df, 0.8)
      test <- anti_join(credit_df, train, by = "ID")

      # Drop the key so it is not used as a predictor.
      train <- select(train, -ID)
      test <- select(test, -ID)

      # Columns available for modelling (the null-model baseline follows).
      names(train)

##  [1] "Age"               "Experience"        "Income"           
##  [4] "ZIP.Code"          "Family"            "CCAvg"            
##  [7] "Education"         "Mortgage"          "PersonalLoan"     
## [10] "SecuritiesAccount" "CDAccount"         "Online"           
## [13] "CreditCard"

      # Null model: class shares of PersonalLoan in the training set, i.e. the
      # accuracy of always predicting the majority class.
      prop.table(table(train[["PersonalLoan"]]))

## 
##      0      1 
## 0.9032 0.0968

      #Model
      form = as.formula("PersonalLoan ~ Age + Experience + Income + ZIP.Code + Family + CCAvg + 
                        Education + Mortgage + CreditCard + SecuritiesAccount + CDAccount + Online")
      
      train$PersonalLoan = as.factor(train$PersonalLoan)
      mod_forest = randomForest(form,data = train, ntree = 200, mtry = 3)
      #mod_forest = randomForest(diagnosis~ radius_mean,data = train, ntree = 200, mtry = 3)
      mod_forest

## 
## Call:
##  randomForest(formula = form, data = train, ntree = 200, mtry = 3) 
##                Type of random forest: classification
##                      Number of trees: 200
## No. of variables tried at each split: 3
## 
##         OOB estimate of  error rate: 1.3%
## Confusion matrix:
##      0   1 class.error
## 0 3605   8     0.00221
## 1   44 343     0.11370

      # Predict the PersonalLoan class for every held-out row ...
      mod_forest_pred <- predict(mod_forest, newdata = test)
      # ... and print the full vector of predicted labels (one entry per test row).
      mod_forest_pred

##    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15   16 
##    0    0    0    0    1    0    0    0    0    0    0    0    0    0    0    0 
##   17   18   19   20   21   22   23   24   25   26   27   28   29   30   31   32 
##    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0 
##   33   34   35   36   37   38   39   40   41   42   43   44   45   46   47   48 
##    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0 
##   49   50   51   52   53   54   55   56   57   58   59   60   61   62   63   64 
##    0    0    0    0    1    0    0    0    0    0    0    0    0    0    0    0 
##   65   66   67   68   69   70   71   72   73   74   75   76   77   78   79   80 
##    0    0    0    0    0    0    1    1    1    0    0    0    0    0    0    0 
##   81   82   83   84   85   86   87   88   89   90   91   92   93   94   95   96 
##    0    0    0    0    0    0    1    0    0    0    0    0    0    0    0    0 
##   97   98   99  100  101  102  103  104  105  106  107  108  109  110  111  112 
##    0    0    0    0    0    0    1    0    0    0    0    0    0    1    0    0 
##  113  114  115  116  117  118  119  120  121  122  123  124  125  126  127  128 
##    0    0    0    1    0    0    0    0    0    0    0    0    0    0    0    0 
##  129  130  131  132  133  134  135  136  137  138  139  140  141  142  143  144 
##    0    0    0    0    0    0    0    0    1    0    0    0    0    0    0    0 
##  145  146  147  148  149  150  151  152  153  154  155  156  157  158  159  160 
##    0    0    0    0    0    0    0    0    0    0    1    0    0    0    1    0 
##  161  162  163  164  165  166  167  168  169  170  171  172  173  174  175  176 
##    0    0    0    0    0    0    0    0    0    0    0    0    0    1    0    1 
##  177  178  179  180  181  182  183  184  185  186  187  188  189  190  191  192 
##    0    0    0    1    0    0    0    0    0    0    1    0    0    0    0    0 
##  193  194  195  196  197  198  199  200  201  202  203  204  205  206  207  208 
##    1    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0 
##  209  210  211  212  213  214  215  216  217  218  219  220  221  222  223  224 
##    0    0    0    0    0    0    0    0    0    0    0    0    0    0    1    0 
##  225  226  227  228  229  230  231  232  233  234  235  236  237  238  239  240 
##    0    0    0    0    0    0    0    0    0    0    0    0    0    1    0    0 
##  241  242  243  244  245  246  247  248  249  250  251  252  253  254  255  256 
##    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0 
##  257  258  259  260  261  262  263  264  265  266  267  268  269  270  271  272 
##    0    0    0    1    1    0    0    0    0    0    0    0    0    0    0    0 
##  273  274  275  276  277  278  279  280  281  282  283  284  285  286  287  288 
##    0    0    0    1    0    1    0    0    0    0    0    0    0    0    0    0 
##  289  290  291  292  293  294  295  296  297  298  299  300  301  302  303  304 
##    0    0    0    0    0    0    0    0    0    0    0    0    0    1    0    0 
##  305  306  307  308  309  310  311  312  313  314  315  316  317  318  319  320 
##    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    1 
##  321  322  323  324  325  326  327  328  329  330  331  332  333  334  335  336 
##    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0 
##  337  338  339  340  341  342  343  344  345  346  347  348  349  350  351  352 
##    0    0    0    0    0    0    0    0    0    1    0    0    0    0    0    0 
##  353  354  355  356  357  358  359  360  361  362  363  364  365  366  367  368 
##    1    1    0    0    0    0    0    0    0    0    0    0    0    0    1    0 
##  369  370  371  372  373  374  375  376  377  378  379  380  381  382  383  384 
##    0    0    0    0    0    0    0    0    0    0    0    1    0    0    0    0 
##  385  386  387  388  389  390  391  392  393  394  395  396  397  398  399  400 
##    0    0    1    0    0    0    0    0    0    0    0    0    0    0    0    0 
##  401  402  403  404  405  406  407  408  409  410  411  412  413  414  415  416 
##    0    0    0    0    0    0    0    0    0    0    0    0    0    1    0    0 
##  417  418  419  420  421  422  423  424  425  426  427  428  429  430  431  432 
##    1    0    0    1    0    0    1    0    0    0    0    0    0    0    0    0 
##  433  434  435  436  437  438  439  440  441  442  443  444  445  446  447  448 
##    0    1    0    0    0    0    0    0    0    0    1    0    0    0    0    0 
##  449  450  451  452  453  454  455  456  457  458  459  460  461  462  463  464 
##    1    0    0    0    0    0    0    0    0    0    0    0    1    0    0    0 
##  465  466  467  468  469  470  471  472  473  474  475  476  477  478  479  480 
##    0    0    0    0    0    1    0    0    0    0    0    0    0    0    0    0 
##  481  482  483  484  485  486  487  488  489  490  491  492  493  494  495  496 
##    0    0    0    0    0    0    0    0    1    1    0    0    0    0    0    0 
##  497  498  499  500  501  502  503  504  505  506  507  508  509  510  511  512 
##    1    1    0    0    0    0    0    0    0    0    1    0    0    0    0    0 
##  513  514  515  516  517  518  519  520  521  522  523  524  525  526  527  528 
##    0    0    0    0    0    0    0    0    0    0    1    0    0    0    1    0 
##  529  530  531  532  533  534  535  536  537  538  539  540  541  542  543  544 
##    0    0    0    0    0    0    0    0    1    0    0    1    0    0    0    0 
##  545  546  547  548  549  550  551  552  553  554  555  556  557  558  559  560 
##    1    0    1    0    0    1    0    0    0    0    1    1    0    0    0    0 
##  561  562  563  564  565  566  567  568  569  570  571  572  573  574  575  576 
##    0    0    0    0    0    1    0    0    0    0    0    0    0    0    0    0 
##  577  578  579  580  581  582  583  584  585  586  587  588  589  590  591  592 
##    0    0    0    0    1    0    1    0    0    0    0    0    1    0    0    0 
##  593  594  595  596  597  598  599  600  601  602  603  604  605  606  607  608 
##    0    0    1    0    0    0    0    0    0    0    0    0    0    0    0    0 
##  609  610  611  612  613  614  615  616  617  618  619  620  621  622  623  624 
##    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0 
##  625  626  627  628  629  630  631  632  633  634  635  636  637  638  639  640 
##    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0 
##  641  642  643  644  645  646  647  648  649  650  651  652  653  654  655  656 
##    0    0    0    0    0    0    0    0    0    0    0    0    0    1    0    0 
##  657  658  659  660  661  662  663  664  665  666  667  668  669  670  671  672 
##    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0 
##  673  674  675  676  677  678  679  680  681  682  683  684  685  686  687  688 
##    0    1    0    1    1    0    0    0    0    0    1    0    0    0    1    0 
##  689  690  691  692  693  694  695  696  697  698  699  700  701  702  703  704 
##    0    1    0    0    1    0    0    0    0    0    0    0    0    0    0    0 
##  705  706  707  708  709  710  711  712  713  714  715  716  717  718  719  720 
##    0    0    0    0    0    0    0    0    1    0    0    0    0    0    0    0 
##  721  722  723  724  725  726  727  728  729  730  731  732  733  734  735  736 
##    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0 
##  737  738  739  740  741  742  743  744  745  746  747  748  749  750  751  752 
##    0    0    1    0    0    0    0    0    0    0    0    0    0    0    0    0 
##  753  754  755  756  757  758  759  760  761  762  763  764  765  766  767  768 
##    0    0    0    0    0    0    0    0    0    0    0    0    0    0    1    0 
##  769  770  771  772  773  774  775  776  777  778  779  780  781  782  783  784 
##    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0 
##  785  786  787  788  789  790  791  792  793  794  795  796  797  798  799  800 
##    0    0    0    0    0    0    0    0    0    0    0    0    0    1    0    0 
##  801  802  803  804  805  806  807  808  809  810  811  812  813  814  815  816 
##    0    0    0    0    0    0    0    0    0    0    0    0    0    1    1    1 
##  817  818  819  820  821  822  823  824  825  826  827  828  829  830  831  832 
##    1    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0 
##  833  834  835  836  837  838  839  840  841  842  843  844  845  846  847  848 
##    0    0    0    0    0    0    0    0    0    0    0    0    0    0    1    0 
##  849  850  851  852  853  854  855  856  857  858  859  860  861  862  863  864 
##    0    0    0    0    0    0    0    0    0    0    0    0    0    0    1    0 
##  865  866  867  868  869  870  871  872  873  874  875  876  877  878  879  880 
##    0    1    0    0    0    0    0    0    0    0    0    0    0    0    0    0 
##  881  882  883  884  885  886  887  888  889  890  891  892  893  894  895  896 
##    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0 
##  897  898  899  900  901  902  903  904  905  906  907  908  909  910  911  912 
##    0    0    0    1    0    1    0    0    0    0    0    0    0    0    0    1 
##  913  914  915  916  917  918  919  920  921  922  923  924  925  926  927  928 
##    0    0    0    0    0    1    0    0    0    0    0    0    0    0    0    0 
##  929  930  931  932  933  934  935  936  937  938  939  940  941  942  943  944 
##    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0 
##  945  946  947  948  949  950  951  952  953  954  955  956  957  958  959  960 
##    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0 
##  961  962  963  964  965  966  967  968  969  970  971  972  973  974  975  976 
##    0    0    0    0    0    0    0    1    0    0    0    1    0    0    0    0 
##  977  978  979  980  981  982  983  984  985  986  987  988  989  990  991  992 
##    0    0    0    0    0    1    0    0    0    0    0    0    0    0    0    0 
##  993  994  995  996  997  998  999 1000 
##    0    1    0    0    0    0    0    0 
## Levels: 0 1

      #confusionMatrix(mod_forest_pred, test$PersonalLoan)
      
      # Tabulate every predictor's MeanDecreaseGini score from the fitted forest,
      # moving the row names into a `rowname` column and listing the largest first.
      mod_forest %>%
        importance() %>%
        as.data.frame() %>%
        rownames_to_column() %>%
        arrange(desc(MeanDecreaseGini))

##              rowname MeanDecreaseGini
## 1             Income           233.71
## 2          Education           139.92
## 3              CCAvg           109.90
## 4             Family            71.31
## 5          CDAccount            37.91
## 6                Age            23.49
## 7           ZIP.Code            21.80
## 8         Experience            21.71
## 9           Mortgage            21.03
## 10        CreditCard             5.44
## 11            Online             4.49
## 12 SecuritiesAccount             2.83

      # Importance table: one row per predictor (name in `rowname`), sorted so the
      # largest MeanDecreaseGini comes first.
      var_importance <- arrange(
        rownames_to_column(as.data.frame(importance(mod_forest))),
        desc(MeanDecreaseGini)
      )

      # Horizontal bar chart of the table above. reorder() orders the bars by
      # MeanDecreaseGini; the legend is hidden because the fill colour only repeats
      # the predictor names already shown on the axis.
      ggplot(
        var_importance,
        aes(
          x = reorder(rowname, MeanDecreaseGini),
          y = MeanDecreaseGini,
          fill = rowname
        )
      ) +
        geom_col() +
        labs(
          title = "Variable Importance from Random Forest Model",
          x = "Predictors",
          y = "Variable Importance (Mean Decrease in Gini Index)"
        ) +
        # scale_fill_discrete(name = "Predictor") +   # unused: legend is suppressed below
        coord_flip() +
        theme(legend.position = "none")

      # Load the new applicants from newCreditApplicants.csv (path is relative to the
      # current working directory) and inspect the column types.
      # `FALSE` is spelled out on purpose: `F` is an ordinary variable that can be
      # reassigned, which would silently flip stringsAsFactors.
      newcredit <- read.csv(
        "newCreditApplicants.csv",
        header = TRUE,
        stringsAsFactors = FALSE
      )
      str(newcredit)

## 'data.frame':    20 obs. of  13 variables:
##  $ ID               : int  3280 309 1544 3076 1989 3751 2957 1586 4053 1604 ...
##  $ Age              : int  26 32 52 26 52 57 62 57 43 36 ...
##  $ Experience       : int  -1 8 26 0 28 32 38 31 19 6 ...
##  $ Income           : int  44 128 101 85 18 52 195 131 54 138 ...
##  $ ZIP.Code         : int  94901 94720 93407 95616 91301 90266 91125 90502 94608 92152 ...
##  $ Family           : int  1 2 2 2 1 3 4 2 2 1 ...
##  $ CCAvg            : num  2 4.33 2.4 1.6 0.3 0.5 5.2 2.7 1.7 7 ...
##  $ Education        : int  2 1 2 3 1 2 3 1 1 3 ...
##  $ Mortgage         : int  0 0 0 0 120 0 522 0 0 86 ...
##  $ SecuritiesAccount: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ CDAccount        : int  0 1 0 0 0 0 1 0 0 0 ...
##  $ Online           : int  0 1 1 0 1 1 1 0 1 1 ...
##  $ CreditCard       : int  0 1 0 0 0 0 1 0 0 0 ...

      # Replace the original ID values with the row position (1..n), then ask the
      # fitted forest for a predicted class for each new applicant and show the result.
      newcredit_df <- dplyr::mutate(newcredit, ID = dplyr::row_number())
      mod_forest_pred2 <- predict(mod_forest, newdata = newcredit_df)
      mod_forest_pred2

##  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 
##  0  0  0  0  0  0  1  0  0  1  0  0  0  0  0  0  0  0  0  0 
## Levels: 0 1

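      # Left disabled: newcredit has no PersonalLoan column (see str() output above),
      # so these predictions have no true labels to be compared against.
      #confusionMatrix(mod_forest_pred2, test$PersonalLoan)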

Random_Forests

2023-02-16