Clear the environment

rm(list = ls(all=TRUE))

Goal

  • Based on various financial ratios, predict whether a company will go bankrupt in the subsequent years.

Agenda

  • Get the data

  • Data Pre-processing

  • Build a model

  • Predictions

  • Communication

Libraries used

library(ROSE)
## Loaded ROSE 0.0-3
library(corrplot)
## corrplot 0.84 loaded
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(C50)
library(rpart)
library(rpart.plot)
library(DMwR)
## Loading required package: grid
library(class)
library(mice)
## 
## Attaching package: 'mice'
## The following objects are masked from 'package:base':
## 
##     cbind, rbind
library(vegan)
## Loading required package: permute
## This is vegan 2.5-2
## 
## Attaching package: 'vegan'
## The following object is masked from 'package:caret':
## 
##     tolerance
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
library(inTrees)
library(e1071)

Reading & Understanding the Data

Read the Data

setwd("C:/Users/brbhatta/Desktop/INSOFE/Cute3")
bank_data <- read.csv("train.csv")

Understand the data

  • Use the str(), summary(), head() and tail() functions to get the dimensions and types of attributes in the dataset

  • The dataset has 36553 observations and 66 variables (64 financial ratios plus the ID and target columns)

str(bank_data)
## 'data.frame':    36553 obs. of  66 variables:
##  $ ID    : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Attr1 : num  0.13537 0.00586 0.1106 0.06391 0.13168 ...
##  $ Attr2 : num  0.452 0.399 0.161 1.407 0.66 ...
##  $ Attr3 : num  0.312 0.198 0.479 -0.296 0.441 ...
##  $ Attr4 : num  2.047 1.939 7.571 0.529 2.71 ...
##  $ Attr5 : num  10.23 9.58 263.9 -46.29 -23.6 ...
##  $ Attr6 : num  0.168 0 0 -0.714 -0.2 ...
##  $ Attr7 : num  0.16763 0.00724 0.13836 0.07907 0.13191 ...
##  $ Attr8 : num  1.213 1.509 5.205 -0.331 0.514 ...
##  $ Attr9 : num  2.255 0.979 0.684 0.985 2.136 ...
##  $ Attr10: num  0.548 0.601 0.839 -0.465 0.34 ...
##  $ Attr11: num  0.1833 0.0295 0.1388 0.0791 0.1861 ...
##  $ Attr12: num  0.5632 0.0344 1.8983 0.1258 0.5112 ...
##  $ Attr13: num  0.0892 0.0364 0.2362 0.0303 0.1671 ...
##  $ Attr14: num  0.16763 0.00724 0.13836 0.07907 0.13191 ...
##  $ Attr15: num  820 4088 364 5433 675 ...
##  $ Attr16: num  0.4453 0.0893 1.0032 0.0672 0.5405 ...
##  $ Attr17: num  2.213 2.509 6.205 0.711 1.514 ...
##  $ Attr18: num  0.16763 0.00724 0.13836 0.07907 0.13191 ...
##  $ Attr19: num  0.07433 0.00739 0.20214 0.02532 0.06176 ...
##  $ Attr20: num  40.2 58.7 38.6 12.5 80.3 ...
##  $ Attr21: num  NA 0.979 1.835 0.449 1.06 ...
##  $ Attr22: num  0.18115 0.00602 0.11669 0.13787 0.18272 ...
##  $ Attr23: num  0.06002 0.00599 0.16159 0.02047 0.06166 ...
##  $ Attr24: num  0.1676 0.0578 0.2354 -0.6893 0.2086 ...
##  $ Attr25: num  0.5115 0.5474 0.8349 -0.4655 0.0482 ...
##  $ Attr26: num  0.3739 0.0858 0.8309 0.0564 0.5401 ...
##  $ Attr27: num  11.554 0.271 249.3 0.435 3.371 ...
##  $ Attr28: num  0.798 0.334 1.069 -0.443 1.467 ...
##  $ Attr29: num  3.58 5.64 4.1 4.66 2.23 ...
##  $ Attr30: num  0.156 0.407 -0.163 0.45 0.306 ...
##  $ Attr31: num  0.07999 0.00739 0.20283 0.02532 0.07901 ...
##  $ Attr32: num  52.3 107.5 46.3 72.3 49.6 ...
##  $ Attr33: num  6.98 3.69 7.89 5.04 7.36 ...
##  $ Attr34: num  4.598 1.95 3.568 0.098 2.878 ...
##  $ Attr35: num  0.178 0.264 0.109 0.138 0.235 ...
##  $ Attr36: num  2.255 0.979 0.684 3.501 2.136 ...
##  $ Attr37: num  3.59 1.33 5.58 0.29 0.57 ...
##  $ Attr38: num  0.649 0.789 0.925 0.313 0.742 ...
##  $ Attr39: num  0.0787 0.2697 0.1599 0.0442 0.1102 ...
##  $ Attr40: num  0.35559 0.1049 3.7693 0.00798 0.34263 ...
##  $ Attr41: num  0.0701 0.3865 0.0384 0.3017 0.054 ...
##  $ Attr42: num  0.08032 0.00615 0.17048 0.04415 0.08556 ...
##  $ Attr43: num  81.5 144 147.8 38.3 104.4 ...
##  $ Attr44: num  41.3 85.3 109.1 25.8 24.1 ...
##  $ Attr45: num  0.5455 0.0373 1.527 0.599 0.2803 ...
##  $ Attr46: num  1.213 1.192 6.577 0.359 0.889 ...
##  $ Attr47: num  43.6 73.9 46 12.3 90.2 ...
##  $ Attr48: num  0.1475 -0.0223 0.0934 0.1224 -0.0422 ...
##  $ Attr49: num  0.0654 -0.0228 0.1364 0.0392 -0.0198 ...
##  $ Attr50: num  1.348 1.024 3.424 0.236 1.059 ...
##  $ Attr51: num  0.2977 0.2105 0.0729 0.6285 0.2581 ...
##  $ Attr52: num  0.143 0.271 0.127 0.198 0.136 ...
##  $ Attr53: num  1.403 1.016 1.872 -0.697 1.129 ...
##  $ Attr54: num  1.661 1.334 2.063 0.469 2.467 ...
##  $ Attr55: num  1189.7 1.94 6092.3 -13606 75.68 ...
##  $ Attr56: num  0.0787 0.2697 0.1599 -0.0153 0.1102 ...
##  $ Attr57: num  0.247 0 0.132 -0.137 0.388 ...
##  $ Attr58: num  0.926 0.793 0.807 1.015 0.939 ...
##  $ Attr59: num  0.184 0.313 0.102 -1.673 1.184 ...
##  $ Attr60: num  9.09 6.22 9.45 29.27 4.55 ...
##  $ Attr61: num  8.83 4.28 3.34 14.14 15.14 ...
##  $ Attr62: num  48.2 78.5 38.9 73.5 44.1 ...
##  $ Attr63: num  7.58 4.65 9.39 4.97 8.28 ...
##  $ Attr64: num  5.77 1.65 1.53 4.68 7.1 ...
##  $ target: int  0 0 0 0 0 0 0 0 0 0 ...
summary(bank_data)
##        ID            Attr1                Attr2          
##  Min.   :    1   Min.   :-256.89000   Min.   :-430.8700  
##  1st Qu.: 9139   1st Qu.:   0.00334   1st Qu.:   0.2696  
##  Median :18277   Median :   0.04960   Median :   0.4726  
##  Mean   :18277   Mean   :   0.05146   Mean   :   0.5777  
##  3rd Qu.:27415   3rd Qu.:   0.13008   3rd Qu.:   0.6892  
##  Max.   :36553   Max.   :  94.28000   Max.   : 480.9600  
##                  NA's   :8            NA's   :8          
##      Attr3               Attr4              Attr5          
##  Min.   :-479.9600   Min.   :   -0.05   Min.   :-11903000  
##  1st Qu.:   0.0221   1st Qu.:    1.05   1st Qu.:      -49  
##  Median :   0.1975   Median :    1.57   Median :       -1  
##  Mean   :   0.1274   Mean   :    6.84   Mean   :     -482  
##  3rd Qu.:   0.4048   3rd Qu.:    2.79   3rd Qu.:       51  
##  Max.   :  28.3360   Max.   :53433.00   Max.   :  1034100  
##  NA's   :8           NA's   :111        NA's   :76         
##      Attr6               Attr7               Attr8         
##  Min.   :-508.4100   Min.   :-517.4800   Min.   : -141.41  
##  1st Qu.:   0.0000   1st Qu.:   0.0057   1st Qu.:    0.43  
##  Median :   0.0000   Median :   0.0595   Median :    1.07  
##  Mean   :  -0.0309   Mean   :   0.1151   Mean   :   12.91  
##  3rd Qu.:   0.0869   3rd Qu.:   0.1517   3rd Qu.:    2.61  
##  Max.   : 543.2500   Max.   : 649.2300   Max.   :53432.00  
##  NA's   :8           NA's   :8           NA's   :79        
##      Attr9              Attr10              Attr11         
##  Min.   :  -3.496   Min.   :-479.9100   Min.   :-189.4500  
##  1st Qu.:   1.019   1st Qu.:   0.2948   1st Qu.:   0.0153  
##  Median :   1.200   Median :   0.5056   Median :   0.0754  
##  Mean   :   2.675   Mean   :   0.6496   Mean   :   0.1549  
##  3rd Qu.:   2.063   3rd Qu.:   0.7084   3rd Qu.:   0.1678  
##  Max.   :9742.300   Max.   :1099.5000   Max.   : 681.5400  
##  NA's   :7          NA's   :8           NA's   :37         
##      Attr12              Attr13              Attr14         
##  Min.   :-6331.800   Min.   :-1317.600   Min.   :-517.4800  
##  1st Qu.:    0.015   1st Qu.:    0.024   1st Qu.:   0.0057  
##  Median :    0.171   Median :    0.068   Median :   0.0596  
##  Mean   :    1.182   Mean   :    0.956   Mean   :   0.1152  
##  3rd Qu.:    0.586   3rd Qu.:    0.135   3rd Qu.:   0.1517  
##  Max.   : 8259.400   Max.   :13315.000   Max.   : 649.2300  
##  NA's   :111         NA's   :113         NA's   :8          
##      Attr15             Attr16              Attr17        
##  Min.   :-5611900   Min.   :-6331.800   Min.   :   -0.41  
##  1st Qu.:     220   1st Qu.:    0.073   1st Qu.:    1.45  
##  Median :     845   Median :    0.245   Median :    2.11  
##  Mean   :    2405   Mean   :    1.519   Mean   :   14.08  
##  3rd Qu.:    2222   3rd Qu.:    0.665   3rd Qu.:    3.69  
##  Max.   :10236000   Max.   : 8259.400   Max.   :53433.00  
##  NA's   :28         NA's   :80          NA's   :79        
##      Attr18              Attr19              Attr20       
##  Min.   :-517.4800   Min.   :-1325.600   Min.   :    -29  
##  1st Qu.:   0.0057   1st Qu.:    0.004   1st Qu.:     15  
##  Median :   0.0596   Median :    0.036   Median :     35  
##  Mean   :   0.1212   Mean   :    0.192   Mean   :    278  
##  3rd Qu.:   0.1517   3rd Qu.:    0.091   3rd Qu.:     64  
##  Max.   : 649.2300   Max.   : 9230.500   Max.   :7809200  
##  NA's   :8           NA's   :114         NA's   :113      
##      Attr21              Attr22              Attr23         
##  Min.   :-1325.000   Min.   :-216.8000   Min.   :-1325.600  
##  1st Qu.:    0.908   1st Qu.:   0.0000   1st Qu.:    0.002  
##  Median :    1.045   Median :   0.0622   Median :    0.030  
##  Mean   :    4.035   Mean   :   0.1356   Mean   :    0.179  
##  3rd Qu.:    1.204   3rd Qu.:   0.1507   3rd Qu.:    0.078  
##  Max.   :29907.000   Max.   : 681.5400   Max.   : 9230.500  
##  NA's   :4983        NA's   :8           NA's   :113        
##      Attr24              Attr25              Attr26         
##  Min.   :-314.3700   Min.   :-500.9300   Min.   :-6331.800  
##  1st Qu.:   0.0212   1st Qu.:   0.1478   1st Qu.:    0.066  
##  Median :   0.1550   Median :   0.3840   Median :    0.221  
##  Mean   :   0.3000   Mean   :   0.3875   Mean   :    1.360  
##  3rd Qu.:   0.3559   3rd Qu.:   0.6098   3rd Qu.:    0.598  
##  Max.   : 831.6600   Max.   :1353.3000   Max.   : 8262.300  
##  NA's   :794         NA's   :8           NA's   :80         
##      Attr27            Attr28              Attr29       
##  Min.   :-259010   Min.   :-3829.900   Min.   :-0.8861  
##  1st Qu.:      0   1st Qu.:    0.040   1st Qu.: 3.4859  
##  Median :      1   Median :    0.468   Median : 4.0033  
##  Mean   :   1188   Mean   :    5.640   Mean   : 3.9963  
##  3rd Qu.:      5   3rd Qu.:    1.511   3rd Qu.: 4.5125  
##  Max.   :4208800   Max.   :21701.000   Max.   : 9.6983  
##  NA's   :2343      NA's   :694         NA's   :8        
##      Attr30              Attr31              Attr32        
##  Min.   : -4940.00   Min.   :-1325.600   Min.   :   -9296  
##  1st Qu.:     0.08   1st Qu.:    0.007   1st Qu.:      46  
##  Median :     0.22   Median :    0.043   Median :      78  
##  Mean   :     8.78   Mean   :    0.215   Mean   :     926  
##  3rd Qu.:     0.41   3rd Qu.:    0.102   3rd Qu.:     128  
##  Max.   :152860.00   Max.   : 9244.300   Max.   :17364000  
##  NA's   :113         NA's   :113         NA's   :312       
##      Attr33              Attr34              Attr35         
##  Min.   :   -7.235   Min.   :-1696.000   Min.   :-169.4700  
##  1st Qu.:    2.821   1st Qu.:    0.312   1st Qu.:   0.0058  
##  Median :    4.621   Median :    1.978   Median :   0.0606  
##  Mean   :    8.743   Mean   :    5.521   Mean   :   0.1286  
##  3rd Qu.:    7.816   3rd Qu.:    4.561   3rd Qu.:   0.1509  
##  Max.   :21944.000   Max.   :21944.000   Max.   : 626.9200  
##  NA's   :111         NA's   :79          NA's   :8          
##      Attr36             Attr37             Attr38         
##  Min.   :  -0.001   Min.   :  -525.5   Min.   :-479.9100  
##  1st Qu.:   1.104   1st Qu.:     1.1   1st Qu.:   0.4192  
##  Median :   1.646   Median :     3.1   Median :   0.6130  
##  Mean   :   2.907   Mean   :   110.8   Mean   :   0.7483  
##  3rd Qu.:   2.425   3rd Qu.:    11.5   3rd Qu.:   0.7722  
##  Max.   :9742.300   Max.   :398920.0   Max.   :1099.5000  
##  NA's   :8          NA's   :16093      NA's   :8          
##      Attr39              Attr40             Attr41         
##  Min.   :-7522.000   Min.   :-101.270   Min.   : -1234.40  
##  1st Qu.:    0.004   1st Qu.:   0.052   1st Qu.:     0.03  
##  Median :    0.037   Median :   0.176   Median :     0.09  
##  Mean   :   -0.328   Mean   :   2.268   Mean   :     8.96  
##  3rd Qu.:    0.092   3rd Qu.:   0.653   3rd Qu.:     0.20  
##  Max.   : 2156.500   Max.   :8007.100   Max.   :288770.00  
##  NA's   :113         NA's   :111        NA's   :650        
##      Attr42               Attr43             Attr44        
##  Min.   :-1395.8000   Min.   : -115870   Min.   : -115870  
##  1st Qu.:    0.0000   1st Qu.:      67   1st Qu.:      35  
##  Median :    0.0379   Median :     100   Median :      55  
##  Mean   :   -0.1348   Mean   :    1153   Mean   :     875  
##  3rd Qu.:    0.0921   3rd Qu.:     141   3rd Qu.:      80  
##  Max.   : 2156.8000   Max.   :30393000   Max.   :22584000  
##  NA's   :113          NA's   :113        NA's   :113       
##      Attr45              Attr46             Attr47       
##  Min.   :-256230.0   Min.   : -101.26   Min.   :    -53  
##  1st Qu.:      0.0   1st Qu.:    0.61   1st Qu.:     16  
##  Median :      0.3   Median :    1.03   Median :     38  
##  Mean   :     16.4   Mean   :    5.94   Mean   :    281  
##  3rd Qu.:      1.0   3rd Qu.:    1.92   3rd Qu.:     71  
##  Max.   : 366030.0   Max.   :53433.00   Max.   :6084200  
##  NA's   :1829        NA's   :112        NA's   :252      
##      Attr48              Attr49              Attr50        
##  Min.   :-218.4200   Min.   :-9001.000   Min.   :   -0.05  
##  1st Qu.:  -0.0387   1st Qu.:   -0.027   1st Qu.:    0.78  
##  Median :   0.0184   Median :    0.011   Median :    1.22  
##  Mean   :   0.0507   Mean   :   -0.527   Mean   :    6.44  
##  3rd Qu.:   0.1082   3rd Qu.:    0.062   3rd Qu.:    2.21  
##  Max.   : 623.8500   Max.   :  107.680   Max.   :53433.00  
##  NA's   :9           NA's   :113         NA's   :79        
##      Attr51             Attr52             Attr53         
##  Min.   :  0.0000   Min.   :  -25.47   Min.   : -3828.90  
##  1st Qu.:  0.1899   1st Qu.:    0.13   1st Qu.:     0.69  
##  Median :  0.3405   Median :    0.21   Median :     1.21  
##  Mean   :  0.4719   Mean   :    4.63   Mean   :    26.45  
##  3rd Qu.:  0.5356   3rd Qu.:    0.35   3rd Qu.:     2.24  
##  Max.   :480.9600   Max.   :88433.00   Max.   :180440.00  
##  NA's   :8          NA's   :256        NA's   :694        
##      Attr54              Attr55             Attr56          
##  Min.   : -3828.90   Min.   :-1805200   Min.   :-1108300.0  
##  1st Qu.:     0.96   1st Qu.:      30   1st Qu.:       0.0  
##  Median :     1.38   Median :    1078   Median :       0.1  
##  Mean   :    27.18   Mean   :    7870   Mean   :     -31.1  
##  3rd Qu.:     2.39   3rd Qu.:    4945   3rd Qu.:       0.1  
##  Max.   :180440.00   Max.   : 6123700   Max.   :     112.0  
##  NA's   :694                            NA's   :113         
##      Attr57               Attr58              Attr59         
##  Min.   :-1667.3000   Min.   :   -198.7   Min.   : -327.970  
##  1st Qu.:    0.0147   1st Qu.:      0.9   1st Qu.:    0.000  
##  Median :    0.1205   Median :      1.0   Median :    0.006  
##  Mean   :   -0.0179   Mean   :     35.0   Mean   :    1.481  
##  3rd Qu.:    0.2863   3rd Qu.:      1.0   3rd Qu.:    0.235  
##  Max.   :  552.6400   Max.   :1108300.0   Max.   :23853.000  
##  NA's   :7            NA's   :76          NA's   :7          
##      Attr60            Attr61              Attr62        
##  Min.   :    -12   Min.   :   -12.66   Min.   :  -14965  
##  1st Qu.:      6   1st Qu.:     4.51   1st Qu.:      42  
##  Median :     10   Median :     6.63   Median :      71  
##  Mean   :    482   Mean   :    17.68   Mean   :    1784  
##  3rd Qu.:     20   3rd Qu.:    10.38   3rd Qu.:     117  
##  Max.   :4818700   Max.   :108000.00   Max.   :25016000  
##  NA's   :1833      NA's   :86          NA's   :113       
##      Attr63              Attr64              target       
##  Min.   :   -0.368   Min.   :    -3.73   Min.   :0.00000  
##  1st Qu.:    3.099   1st Qu.:     2.19   1st Qu.:0.00000  
##  Median :    5.079   Median :     4.31   Median :0.00000  
##  Mean   :    9.417   Mean   :    77.35   Mean   :0.04829  
##  3rd Qu.:    8.607   3rd Qu.:     9.79   3rd Qu.:0.00000  
##  Max.   :23454.000   Max.   :294770.00   Max.   :1.00000  
##  NA's   :111         NA's   :694
head(bank_data)
##   ID     Attr1    Attr2    Attr3   Attr4    Attr5    Attr6     Attr7
## 1  1 0.1353700 0.451850  0.31162  2.0469  10.2340  0.16768 0.1676300
## 2  2 0.0058613 0.398580  0.19768  1.9390   9.5771  0.00000 0.0072373
## 3  3 0.1106000 0.161170  0.47894  7.5711 263.9000  0.00000 0.1383600
## 4  4 0.0639110 1.407300 -0.29595  0.5291 -46.2870 -0.71420 0.0790710
## 5  5 0.1316800 0.660310  0.44121  2.7098 -23.5960 -0.20007 0.1319100
## 6  6 0.2541100 0.022149  0.69694 33.2270  86.6400  0.00000 0.2541100
##      Attr8   Attr9   Attr10   Attr11   Attr12   Attr13    Attr14   Attr15
## 1  1.21310 2.25540  0.54815 0.183310  0.56316 0.089220 0.1676300  819.600
## 2  1.50890 0.97880  0.60142 0.029484  0.03438 0.036362 0.0072373 4087.600
## 3  5.20450 0.68447  0.83883 0.138830  1.89830 0.236220 0.1383600  363.850
## 4 -0.33076 0.98490 -0.46548 0.079071  0.12581 0.030274 0.0790710 5433.400
## 5  0.51445 2.13570  0.33969 0.186110  0.51117 0.167100 0.1319100  675.350
## 6 44.14900 1.97160  0.97785 0.264670 11.75000 0.150150 0.2541100   27.309
##      Attr16   Attr17    Attr18    Attr19 Attr20  Attr21    Attr22
## 1  0.445340  2.21310 0.1676300 0.0743250 40.156      NA 0.1811500
## 2  0.089295  2.50890 0.0072373 0.0073941 58.670 0.97850 0.0060182
## 3  1.003200  6.20450 0.1383600 0.2021400 38.625 1.83520 0.1166900
## 4  0.067177  0.71058 0.0790710 0.0253210 12.470 0.44909 0.1378700
## 5  0.540460  1.51440 0.1319100 0.0617630 80.288 1.06000 0.1827200
## 6 13.366000 45.14900 0.2541100 0.1288800 53.382 1.33860 0.2590200
##      Attr23    Attr24    Attr25    Attr26    Attr27   Attr28 Attr29
## 1 0.0600190  0.167630  0.511480  0.373930  11.55400  0.79756 3.5818
## 2 0.0059882  0.057817  0.547400  0.085842   0.27052  0.33402 5.6431
## 3 0.1615900  0.235380  0.834900  0.830940 249.30000  1.06860 4.1045
## 4 0.0204660 -0.689290 -0.465480  0.056405   0.43483 -0.44339 4.6625
## 5 0.0616560  0.208580  0.048201  0.540110   3.37080  1.46710 2.2343
## 6 0.1288800  0.603570  0.835400 13.366000  24.51900  2.47640 3.6387
##      Attr30    Attr31   Attr32  Attr33    Attr34  Attr35  Attr36  Attr37
## 1  0.155790 0.0799860  52.2890  6.9805  4.598500 0.17756 2.25540 3.58820
## 2  0.407210 0.0073941 107.4900  3.6924  1.950100 0.26400 0.97880 1.33380
## 3 -0.162880 0.2028300  46.2630  7.8896  3.567900 0.10943 0.68447 5.58440
## 4  0.450030 0.0253210  72.3500  5.0450  0.097967 0.13787 3.50080 0.28998
## 5  0.305820 0.0790120  49.5620  7.3646  2.878100 0.23529 2.13570 0.57046
## 6 -0.019578 0.1289300   4.5636 79.9800 78.090000 0.24202 1.97160      NA
##    Attr38   Attr39    Attr40    Attr41    Attr42  Attr43  Attr44   Attr45
## 1 0.64880 0.078728 0.3555900 0.0701390 0.0803160  81.473  41.317 0.545540
## 2 0.78949 0.269720 0.1049000 0.3865400 0.0061486 143.980  85.309 0.037254
## 3 0.92467 0.159870 3.7693000 0.0383720 0.1704800 147.760 109.140 1.527000
## 4 0.31335 0.044150 0.0079827 0.3017400 0.0441500  38.281  25.811 0.599050
## 5 0.74195 0.110170 0.3426300 0.0539890 0.0855550 104.400  24.108 0.280300
## 6 0.97785 0.122750 2.9208000 0.0024532 0.1313700 121.330  67.949 0.881220
##     Attr46 Attr47    Attr48    Attr49   Attr50   Attr51   Attr52   Attr53
## 1  1.21330 43.588  0.147550  0.065421  1.34840 0.297670 0.143260  1.40290
## 2  1.19170 73.881 -0.022335 -0.022819  1.02410 0.210510 0.270830  1.01620
## 3  6.57730 45.975  0.093362  0.136400  3.42380 0.072886 0.126750  1.87160
## 4  0.35935 12.282  0.122400  0.039196  0.23629 0.628480 0.198220 -0.69738
## 5  0.88928 90.228 -0.042243 -0.019780  1.05900 0.258050 0.135790  1.12950
## 6 19.89300 60.852  0.217090  0.110110 32.44200 0.021626 0.012503  3.47450
##    Attr54     Attr55    Attr56   Attr57  Attr58   Attr59  Attr60  Attr61
## 1 1.66060   1189.700  0.078728  0.24695 0.92586  0.18362  9.0895  8.8342
## 2 1.33400      1.939  0.269720  0.00000 0.79303  0.31271  6.2213  4.2785
## 3 2.06320   6092.300  0.159870  0.13185 0.80748  0.10234  9.4499  3.3443
## 4 0.46946 -13606.000 -0.015327 -0.13730 1.01530 -1.67320 29.2710 14.1410
## 5 2.46710     75.681  0.110170  0.38764 0.93881  1.18420  4.5462 15.1400
## 6 3.47450   3033.300  0.122750  0.25986 0.87268  0.00000  6.8375  5.3717
##    Attr62  Attr63 Attr64 target
## 1 48.1720  7.5770 5.7725      0
## 2 78.4990  4.6497 1.6539      0
## 3 38.8670  9.3910 1.5272      0
## 4 73.4580  4.9688 4.6785      0
## 5 44.1010  8.2764 7.1014      0
## 6  4.0034 91.1710 7.0057      0
tail(bank_data)
##          ID     Attr1   Attr2     Attr3   Attr4    Attr5      Attr6
## 36548 36548 -0.117490 1.10800 -0.224310 0.79755 -201.640 -0.0593250
## 36549 36549  0.425980 0.18731  0.361740 3.00860   50.257  1.2929000
## 36550 36550 -0.016238 0.43902  0.069466 1.16780 -191.590  0.0045487
## 36551 36551  0.073750 0.18138  0.219690 3.50200   66.436  0.1694200
## 36552 36552 -0.871080 1.16990 -0.187060 0.84010  -62.657 -0.0281450
## 36553 36553  0.063897 0.84939  0.285080 1.55680   -3.031 -0.1169200
##           Attr7     Attr8   Attr9   Attr10    Attr11    Attr12    Attr13
## 36548 -0.117490 -0.097464 0.92515 -0.10799 -0.117490 -0.106040 -0.062038
## 36549  0.425980  3.967200 1.38350  0.74311  0.425980  2.365200  0.358000
## 36550 -0.021788  1.108500 0.94926  0.48663 -0.021788 -0.052624  0.052201
## 36551  0.092655  4.513200 1.09190  0.81862  0.092655  1.055300  0.247430
## 36552 -0.871080 -0.145210 1.89250 -0.16988 -0.643470 -0.744590 -0.460280
## 36553  0.083468  0.177310 4.83790  0.15060  0.114700  0.163030  0.022181
##          Attr14   Attr15    Attr16  Attr17    Attr18    Attr19  Attr20
## 36548 -0.117490 -4634.60 -0.078756 0.90254 -0.117490 -0.083527 155.200
## 36549  0.425980   143.04  2.551700 5.33860  0.425980  0.319050  62.803
## 36550 -0.021788  5365.50  0.068027 2.27780 -0.021788 -0.038084 226.770
## 36551  0.092655   272.72  1.338400 5.51320  0.092655  0.094437  14.523
## 36552 -0.871080  -490.20 -0.744590 0.85479 -0.871080 -0.460280  57.237
## 36553  0.083468  2889.20  0.126330 1.17730  0.083468  0.017253  23.837
##        Attr21     Attr22    Attr23     Attr24   Attr25    Attr26   Attr27
## 36548 0.82328 -0.1192400 -0.083527 -0.0593250 -0.10799 -0.078756 -0.78427
## 36549 0.94648  0.4258900  0.319050  1.2929000  0.74311  2.551700  4.41330
## 36550 1.25910 -0.0086335 -0.028383  0.0073332  0.48663  0.080669 -0.14325
## 36551 1.06550  0.1015600  0.075168  0.2137100  0.81862  1.234200  1.13030
## 36552 1.07610 -0.6434700 -0.460280 -1.6029000 -0.89923 -0.744590 -2.82700
## 36553 0.90215  0.0951480  0.013208 -0.0285290  0.10235  0.103290  3.04660
##          Attr28 Attr29   Attr30    Attr31  Attr32  Attr33    Attr34
## 36548  -1.92840 4.3581 0.712570 -0.083527 266.000  1.3722 -0.107620
## 36549   0.78955 4.6393 0.055119  0.319050  68.120  5.3582  2.273700
## 36550   0.13450 4.7117 0.764470 -0.038084 250.750  1.4556 -0.019665
## 36551   0.31723 4.4366 0.101630  0.094437  35.667 10.2330  0.559910
## 36552 -10.88900 1.8321 0.593950 -0.460150 153.610  2.3762  2.376200
## 36553   1.40590 3.7510 0.138870  0.023709  39.432  9.3064  5.609500
##           Attr35  Attr36  Attr37   Attr38    Attr39   Attr40    Attr41
## 36548 -0.1192400 1.42300      NA -0.10799 -0.084773 0.106480 -0.409240
## 36549  0.4258900 1.40480 43.2640  0.75033  0.318990 0.656740  0.012886
## 36550 -0.0086335 0.61816  5.1275  0.51161 -0.015090 0.021591  0.335510
## 36551  0.1015600 1.02290  2.8687  0.91220  0.103510 1.414600  0.023695
## 36552 -0.8873900 1.89250      NA -0.16988 -0.468900 0.039169 -0.060603
## 36553  0.0988740 4.87930  1.6105  0.44934  0.020437 0.364520  0.237950
##          Attr42  Attr43  Attr44    Attr45  Attr46  Attr47    Attr48
## 36548 -0.084773 198.700  43.500 -0.196440 0.25777 143.580 -0.149460
## 36549  0.318990 115.790  52.991  1.854300 1.73300  86.890  0.373890
## 36550 -0.015090 302.770  75.998 -0.045684 0.30930 215.260 -0.060287
## 36551  0.103510  68.186  53.663  1.889200 3.05740  15.858 -0.048547
## 36552 -0.340010 180.720 123.480 -2.935200 0.58643  38.966 -0.643470
## 36553  0.019667  46.055  22.218  0.202240 0.98164  24.204  0.071309
##         Attr49  Attr50   Attr51   Attr52   Attr53   Attr54    Attr55
## 36548 -0.10626 0.79755 1.108000 0.728760 -0.92837 -0.92837 -5116.500
## 36549  0.28004 2.89270 0.180100 0.186630  1.62200  1.63770 15766.000
## 36550 -0.10538 1.10130 0.414040 0.686980  0.94219  0.99054  3576.300
## 36551 -0.04948 1.69530 0.087803 0.097719  1.18210  1.31720  6003.900
## 36552 -0.34001 0.84010 1.169900 0.420840 -9.88950 -9.88950   -12.708
## 36553  0.01474 0.93839 0.511970 0.107450  0.74273  2.21600  1607.000
##          Attr56    Attr57  Attr58   Attr59  Attr60  Attr61  Attr62  Attr63
## 36548 -0.080906  1.088000 1.08090 0.000000  2.3518  8.3909 287.520  1.2695
## 36549  0.277220  0.573240 0.72278 0.009708  5.8118  6.8880  49.236  7.4133
## 36550 -0.053449 -0.033368 1.05340 0.051323  1.6096  4.8027 264.150  1.3818
## 36551  0.084195  0.090091 0.91581 0.114310 25.1320  6.8017  32.664 11.1740
## 36552 -0.468900  5.127500 1.44510 0.000000  6.3770  2.9560 225.630  1.6177
## 36553  0.020425  0.424280 0.97650 1.983700 15.3120 16.4280  38.627  9.4494
##         Attr64 target
## 36548  12.0920      0
## 36549   2.9141      0
## 36550   1.1077      0
## 36551   1.4168      0
## 36552 110.1700      0
## 36553  23.8590      1

Data Description

Attr1   net profit / total assets
Attr2   total liabilities / total assets
Attr3   working capital / total assets
Attr4   current assets / short-term liabilities
Attr5   [(cash + short-term securities + receivables - short-term liabilities) / (operating expenses - depreciation)] * 365
Attr6   retained earnings / total assets
Attr7   EBIT / total assets
Attr8   book value of equity / total liabilities
Attr9   sales / total assets
Attr10  equity / total assets
Attr11  (gross profit + extraordinary items + financial expenses) / total assets
Attr12  gross profit / short-term liabilities
Attr13  (gross profit + depreciation) / sales
Attr14  (gross profit + interest) / total assets
Attr15  (total liabilities * 365) / (gross profit + depreciation)
Attr16  (gross profit + depreciation) / total liabilities
Attr17  total assets / total liabilities
Attr18  gross profit / total assets
Attr19  gross profit / sales
Attr20  (inventory * 365) / sales
Attr21  sales (n) / sales (n-1)
Attr22  profit on operating activities / total assets
Attr23  net profit / sales
Attr24  gross profit (in 3 years) / total assets
Attr25  (equity - share capital) / total assets
Attr26  (net profit + depreciation) / total liabilities
Attr27  profit on operating activities / financial expenses
Attr28  working capital / fixed assets
Attr29  logarithm of total assets
Attr30  (total liabilities - cash) / sales
Attr31  (gross profit + interest) / sales
Attr32  (current liabilities * 365) / cost of products sold
Attr33  operating expenses / short-term liabilities
Attr34  operating expenses / total liabilities
Attr35  profit on sales / total assets
Attr36  total sales / total assets
Attr37  (current assets - inventories) / long-term liabilities
Attr38  constant capital / total assets
Attr39  profit on sales / sales
Attr40  (current assets - inventory - receivables) / short-term liabilities
Attr41  total liabilities / ((profit on operating activities + depreciation) * (12/365))
Attr42  profit on operating activities / sales
Attr43  rotation receivables + inventory turnover in days
Attr44  (receivables * 365) / sales
Attr45  net profit / inventory
Attr46  (current assets - inventory) / short-term liabilities
Attr47  (inventory * 365) / cost of products sold
Attr48  EBITDA (profit on operating activities - depreciation) / total assets
Attr49  EBITDA (profit on operating activities - depreciation) / sales
Attr50  current assets / total liabilities
Attr51  short-term liabilities / total assets
Attr52  (short-term liabilities * 365) / cost of products sold
Attr53  equity / fixed assets
Attr54  constant capital / fixed assets
Attr55  working capital
Attr56  (sales - cost of products sold) / sales
Attr57  (current assets - inventory - short-term liabilities) / (sales - gross profit - depreciation)
Attr58  total costs / total sales
Attr59  long-term liabilities / equity
Attr60  sales / inventory
Attr61  sales / receivables
Attr62  (short-term liabilities * 365) / sales
Attr63  sales / short-term liabilities
Attr64  sales / fixed assets

Data Pre-processing

Verify the data types assigned to the variables in the dataset

str(bank_data)
## 'data.frame':    36553 obs. of  66 variables:
##  $ ID    : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Attr1 : num  0.13537 0.00586 0.1106 0.06391 0.13168 ...
##  $ Attr2 : num  0.452 0.399 0.161 1.407 0.66 ...
##  $ Attr3 : num  0.312 0.198 0.479 -0.296 0.441 ...
##  $ Attr4 : num  2.047 1.939 7.571 0.529 2.71 ...
##  $ Attr5 : num  10.23 9.58 263.9 -46.29 -23.6 ...
##  $ Attr6 : num  0.168 0 0 -0.714 -0.2 ...
##  $ Attr7 : num  0.16763 0.00724 0.13836 0.07907 0.13191 ...
##  $ Attr8 : num  1.213 1.509 5.205 -0.331 0.514 ...
##  $ Attr9 : num  2.255 0.979 0.684 0.985 2.136 ...
##  $ Attr10: num  0.548 0.601 0.839 -0.465 0.34 ...
##  $ Attr11: num  0.1833 0.0295 0.1388 0.0791 0.1861 ...
##  $ Attr12: num  0.5632 0.0344 1.8983 0.1258 0.5112 ...
##  $ Attr13: num  0.0892 0.0364 0.2362 0.0303 0.1671 ...
##  $ Attr14: num  0.16763 0.00724 0.13836 0.07907 0.13191 ...
##  $ Attr15: num  820 4088 364 5433 675 ...
##  $ Attr16: num  0.4453 0.0893 1.0032 0.0672 0.5405 ...
##  $ Attr17: num  2.213 2.509 6.205 0.711 1.514 ...
##  $ Attr18: num  0.16763 0.00724 0.13836 0.07907 0.13191 ...
##  $ Attr19: num  0.07433 0.00739 0.20214 0.02532 0.06176 ...
##  $ Attr20: num  40.2 58.7 38.6 12.5 80.3 ...
##  $ Attr21: num  NA 0.979 1.835 0.449 1.06 ...
##  $ Attr22: num  0.18115 0.00602 0.11669 0.13787 0.18272 ...
##  $ Attr23: num  0.06002 0.00599 0.16159 0.02047 0.06166 ...
##  $ Attr24: num  0.1676 0.0578 0.2354 -0.6893 0.2086 ...
##  $ Attr25: num  0.5115 0.5474 0.8349 -0.4655 0.0482 ...
##  $ Attr26: num  0.3739 0.0858 0.8309 0.0564 0.5401 ...
##  $ Attr27: num  11.554 0.271 249.3 0.435 3.371 ...
##  $ Attr28: num  0.798 0.334 1.069 -0.443 1.467 ...
##  $ Attr29: num  3.58 5.64 4.1 4.66 2.23 ...
##  $ Attr30: num  0.156 0.407 -0.163 0.45 0.306 ...
##  $ Attr31: num  0.07999 0.00739 0.20283 0.02532 0.07901 ...
##  $ Attr32: num  52.3 107.5 46.3 72.3 49.6 ...
##  $ Attr33: num  6.98 3.69 7.89 5.04 7.36 ...
##  $ Attr34: num  4.598 1.95 3.568 0.098 2.878 ...
##  $ Attr35: num  0.178 0.264 0.109 0.138 0.235 ...
##  $ Attr36: num  2.255 0.979 0.684 3.501 2.136 ...
##  $ Attr37: num  3.59 1.33 5.58 0.29 0.57 ...
##  $ Attr38: num  0.649 0.789 0.925 0.313 0.742 ...
##  $ Attr39: num  0.0787 0.2697 0.1599 0.0442 0.1102 ...
##  $ Attr40: num  0.35559 0.1049 3.7693 0.00798 0.34263 ...
##  $ Attr41: num  0.0701 0.3865 0.0384 0.3017 0.054 ...
##  $ Attr42: num  0.08032 0.00615 0.17048 0.04415 0.08556 ...
##  $ Attr43: num  81.5 144 147.8 38.3 104.4 ...
##  $ Attr44: num  41.3 85.3 109.1 25.8 24.1 ...
##  $ Attr45: num  0.5455 0.0373 1.527 0.599 0.2803 ...
##  $ Attr46: num  1.213 1.192 6.577 0.359 0.889 ...
##  $ Attr47: num  43.6 73.9 46 12.3 90.2 ...
##  $ Attr48: num  0.1475 -0.0223 0.0934 0.1224 -0.0422 ...
##  $ Attr49: num  0.0654 -0.0228 0.1364 0.0392 -0.0198 ...
##  $ Attr50: num  1.348 1.024 3.424 0.236 1.059 ...
##  $ Attr51: num  0.2977 0.2105 0.0729 0.6285 0.2581 ...
##  $ Attr52: num  0.143 0.271 0.127 0.198 0.136 ...
##  $ Attr53: num  1.403 1.016 1.872 -0.697 1.129 ...
##  $ Attr54: num  1.661 1.334 2.063 0.469 2.467 ...
##  $ Attr55: num  1189.7 1.94 6092.3 -13606 75.68 ...
##  $ Attr56: num  0.0787 0.2697 0.1599 -0.0153 0.1102 ...
##  $ Attr57: num  0.247 0 0.132 -0.137 0.388 ...
##  $ Attr58: num  0.926 0.793 0.807 1.015 0.939 ...
##  $ Attr59: num  0.184 0.313 0.102 -1.673 1.184 ...
##  $ Attr60: num  9.09 6.22 9.45 29.27 4.55 ...
##  $ Attr61: num  8.83 4.28 3.34 14.14 15.14 ...
##  $ Attr62: num  48.2 78.5 38.9 73.5 44.1 ...
##  $ Attr63: num  7.58 4.65 9.39 4.97 8.28 ...
##  $ Attr64: num  5.77 1.65 1.53 4.68 7.1 ...
##  $ target: int  0 0 0 0 0 0 0 0 0 0 ...

Plot the data to understand relationships between features

par(mfrow = c(2,2))

plot(bank_data[,"Attr9"], bank_data[,"Attr10"], xlab = "sales / total assets", ylab = "equity / total assets", type = "p", main = "sales and equity")
plot(bank_data[,"Attr18"], bank_data[,"Attr24"], xlab = "gross profit / total assets", ylab = "gross profit (in 3 years) / total assets", type = "p", main = "gross profit now and in 3 years")
plot(bank_data[,"Attr22"], bank_data[,"Attr7"], xlab = "profit on operating activities / total assets", ylab = "EBIT / total assets", type = "p", main = "EBIT and profit on operating activities")
plot(bank_data[,"Attr2"], bank_data[,"Attr3"], xlab = "total liabilities / total assets", ylab = "working capital / total assets", type = "p", main = "working capital and total liabilities")
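
A numeric complement to the scatterplots (a minimal sketch; use = "complete.obs" skips the rows with NAs still present at this stage):

# Correlations for two of the plotted pairs, ignoring incomplete rows
cor(bank_data$Attr18, bank_data$Attr24, use = "complete.obs") # gross profit now vs in 3 years
cor(bank_data$Attr22, bank_data$Attr7, use = "complete.obs")  # operating profit vs EBIT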

Feature Engineering

Subtracting current gross profit (Attr18) from gross profit in 3 years (Attr24)

bank_data1 <- cbind(bank_data, gross_profit_growth = bank_data$Attr24 - bank_data$Attr18)

Check for missing values

sum(is.na(bank_data))
## [1] 35187
bank_data=centralImputation(bank_data)
sum(is.na(bank_data))
## [1] 0
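
centralImputation() replaces each NA with the column's central value: the median for numeric columns and the mode for factors. A minimal sketch of the numeric case (all predictors here are numeric; impute_median is an illustrative helper, not part of DMwR):

# Hypothetical helper equivalent to centralImputation() for numeric columns:
# replace each NA with the median of the observed values in that column
impute_median <- function(df) {
  for (col in names(df)) {
    if (is.numeric(df[[col]])) {
      df[[col]][is.na(df[[col]])] <- median(df[[col]], na.rm = TRUE)
    }
  }
  df
}
# bank_data <- impute_median(bank_data)  # would give the same result here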

Check for class imbalance

prop.table(table(bank_data$target))
## 
##          0          1 
## 0.95171395 0.04828605
bank_data_rose <- ROSE(target~ ., data=bank_data, seed=111)$data
prop.table(table(bank_data_rose$target))
## 
##        0        1 
## 0.500807 0.499193
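
ROSE draws a synthetic, roughly balanced sample from smoothed density estimates around both classes. Since DMwR is already attached, SMOTE() would be a possible alternative; a sketch, assuming a factor target (perc.over and perc.under control over- and under-sampling):

tmp <- bank_data
tmp$target <- as.factor(tmp$target)  # SMOTE requires a factor outcome
bank_data_smote <- SMOTE(target ~ ., data = tmp, perc.over = 200, perc.under = 200)
prop.table(table(bank_data_smote$target))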

Find the correlation between the features

cat_var <- "target"
num_var <- setdiff(names(bank_data), c(cat_var, "ID"))  # drop the ID index column from the correlation matrix
corrplot(cor(bank_data_rose[, num_var]), method = "circle", tl.cex = 0.5, tl.col = "black", order = "hclust", diag = FALSE)

corrplot(cor(bank_data_rose[, num_var]), method = "shade", type = "full")

Split the Data into train and test sets

  • Use stratified sampling to split the data into train/test sets (70/30)

  • Use the createDataPartition() function from the caret package to do stratified sampling

# Set the seed after attaching the caret package

set.seed(111)

# The first argument is the outcome variable to stratify on; the second is the proportion to sample

# Remember to include list = F; otherwise the function returns a list, which cannot be used to subset a data frame

trainIndex <- createDataPartition(bank_data$target, p = .7, list = F)

train_data <- bank_data[trainIndex, ]

test_data <- bank_data[-trainIndex, ]
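
A quick sanity check (a minimal sketch) that the stratified split preserved the class proportions:

prop.table(table(train_data$target))
prop.table(table(test_data$target))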

Build a Decision Tree

Model the tree

  • We will be using Quinlan’s C5.0 decision tree algorithm implementation from the C50 package to build our decision tree
train_data$target<-as.factor(train_data$target)
test_data$target<-as.factor(test_data$target)
str(train_data$target)
##  Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
c5_tree <- C5.0(target ~ . , train_data)

# Use the rules = T argument if you want to extract rules later from the model

c5_rules <- C5.0(target ~ . , train_data, rules = T)

Variable Importance in trees

  • Find the importance of each variable in the dataset using the C5imp() function

  • The default metric "usage" gives the percentage of training samples that pass through a split on that attribute. The variable used for splitting at the root node therefore always scores 100, while variables used near the leaves can score close to 0 because little data remains to classify

C5imp(c5_tree, metric = "usage")
##        Overall
## Attr41  100.00
## Attr27   98.93
## Attr46   93.18
## Attr24   72.54
## Attr34   27.54
## Attr5    23.10
## Attr56   11.09
## Attr58    9.90
## Attr35    9.74
## Attr8     6.68
## Attr60    6.53
## Attr42    5.26
## Attr9     4.40
## Attr6     2.86
## Attr15    2.78
## Attr25    1.73
## Attr44    1.15
## Attr23    0.50
## Attr22    0.42
## Attr13    0.35
## Attr39    0.28
## Attr19    0.25
## Attr48    0.19
## Attr38    0.16
## Attr53    0.16
## Attr64    0.13
## Attr29    0.05
## ID        0.00
## Attr1     0.00
## Attr2     0.00
## Attr3     0.00
## Attr4     0.00
## Attr7     0.00
## Attr10    0.00
## Attr11    0.00
## Attr12    0.00
## Attr14    0.00
## Attr16    0.00
## Attr17    0.00
## Attr18    0.00
## Attr20    0.00
## Attr21    0.00
## Attr26    0.00
## Attr28    0.00
## Attr30    0.00
## Attr31    0.00
## Attr32    0.00
## Attr33    0.00
## Attr36    0.00
## Attr37    0.00
## Attr40    0.00
## Attr43    0.00
## Attr45    0.00
## Attr47    0.00
## Attr49    0.00
## Attr50    0.00
## Attr51    0.00
## Attr52    0.00
## Attr54    0.00
## Attr55    0.00
## Attr57    0.00
## Attr59    0.00
## Attr61    0.00
## Attr62    0.00
## Attr63    0.00

Rules from trees

  • Understand the summary of the returned C5.0 rules-based model
summary(c5_rules)
## 
## Call:
## C5.0.formula(formula = target ~ ., data = train_data, rules = T)
## 
## 
## C5.0 [Release 2.07 GPL Edition]      Fri Jun 22 21:28:43 2018
## -------------------------------
## 
## Class specified by attribute `outcome'
## 
## Read 25588 cases (66 attributes) from undefined.data
## 
## Rules:
## 
## Rule 1: (1868, lift 1.0)
##  Attr27 <= 0.87682
##  Attr34 <= 0.078963
##  Attr46 > 0.5449
##  ->  class 0  [0.999]
## 
## Rule 2: (174, lift 1.0)
##  Attr5 > -22.458
##  Attr5 <= 82.901
##  Attr41 > -0.000156
##  Attr58 <= 0.34016
##  ->  class 0  [0.994]
## 
## Rule 3: (592/7, lift 1.0)
##  Attr34 <= 0.22583
##  Attr41 > -0.000156
##  Attr46 > 0.5449
##  Attr56 > 0.068701
##  ->  class 0  [0.987]
## 
## Rule 4: (8685/121, lift 1.0)
##  Attr27 <= 0.96568
##  Attr41 > -0.000156
##  ->  class 0  [0.986]
## 
## Rule 5: (623/9, lift 1.0)
##  Attr5 <= -194.98
##  Attr27 > 0.96568
##  Attr34 > 0.14597
##  Attr58 > 0.77596
##  ->  class 0  [0.984]
## 
## Rule 6: (2151/43, lift 1.0)
##  Attr5 > 176.99
##  ->  class 0  [0.980]
## 
## Rule 7: (7239/176, lift 1.0)
##  Attr42 > 0.082447
##  ->  class 0  [0.976]
## 
## Rule 8: (18750/545, lift 1.0)
##  Attr34 > 0.14597
##  Attr41 > -0.000156
##  Attr46 > 0.18877
##  ->  class 0  [0.971]
## 
## Rule 9: (11760/342, lift 1.0)
##  Attr8 > 0.33112
##  Attr27 > 0.96568
##  Attr34 > 0.20488
##  ->  class 0  [0.971]
## 
## Rule 10: (3918/360, lift 1.0)
##  Attr27 <= -0.0305
##  ->  class 0  [0.908]
## 
## Rule 11: (89, lift 20.7)
##  Attr6 > 0.033369
##  Attr27 > 1.0896
##  Attr27 <= 1.09005
##  Attr34 <= 1.577
##  Attr35 > 0.063439
##  ->  class 1  [0.989]
## 
## Rule 12: (72/1, lift 20.4)
##  Attr5 > -22.458
##  Attr5 <= 82.901
##  Attr41 > -0.000156
##  Attr44 > 23.394
##  Attr46 <= 0.5449
##  Attr58 > 0.34016
##  ->  class 1  [0.973]
## 
## Rule 13: (31, lift 20.3)
##  Attr5 > -50.6
##  Attr5 <= 23.747
##  Attr34 > 0.001229
##  Attr42 <= -0.023691
##  Attr44 > 25.918
##  Attr46 <= 0.40497
##  ->  class 1  [0.970]
## 
## Rule 14: (104/3, lift 20.1)
##  Attr27 > -0.0305
##  Attr34 <= -0.015941
##  Attr41 <= -0.000156
##  ->  class 1  [0.962]
## 
## Rule 15: (24, lift 20.1)
##  Attr5 > -194.98
##  Attr8 <= 0.33112
##  Attr27 > 0.96568
##  Attr44 > 23.023
##  Attr46 <= 0.18877
##  Attr58 > 0.77596
##  ->  class 1  [0.962]
## 
## Rule 16: (19, lift 19.9)
##  Attr5 > -31.004
##  Attr5 <= 23.747
##  Attr27 <= -0.0305
##  Attr34 > 0.001229
##  Attr41 <= -0.000156
##  Attr46 <= 0.59928
##  Attr58 <= 1.0202
##  ->  class 1  [0.952]
## 
## Rule 17: (15, lift 19.7)
##  Attr8 <= 0.33112
##  Attr39 <= 0.086457
##  Attr41 > -0.000156
##  Attr42 > 0.020436
##  Attr46 <= 0.5449
##  Attr58 <= 0.77596
##  ->  class 1  [0.941]
## 
## Rule 18: (32/1, lift 19.7)
##  Attr5 > -22.458
##  Attr5 <= 82.901
##  Attr27 > 0.67238
##  Attr46 <= 0.5449
##  Attr53 <= 0.69144
##  ->  class 1  [0.941]
## 
## Rule 19: (70/4, lift 19.5)
##  Attr5 > -986.05
##  Attr5 <= 176.99
##  Attr27 <= -0.0305
##  Attr34 > 0.0070654
##  Attr35 <= -0.0034142
##  Attr56 > 0.065998
##  ->  class 1  [0.931]
## 
## Rule 20: (77/6, lift 19.1)
##  Attr5 > -22.458
##  Attr5 <= 82.901
##  Attr41 > -0.000156
##  Attr44 > 23.394
##  Attr46 <= 0.5449
##  ->  class 1  [0.911]
## 
## Rule 21: (29/3, lift 18.2)
##  Attr5 <= 23.747
##  Attr23 <= -0.021107
##  Attr34 > 0.001229
##  Attr41 <= -0.000156
##  Attr56 <= 0.065998
##  Attr58 <= 1.0202
##  Attr64 > 1.2237
##  ->  class 1  [0.871]
## 
## Rule 22: (5, lift 17.9)
##  Attr27 > -0.0305
##  Attr27 <= -0.0034433
##  Attr34 > 0.71601
##  Attr41 <= -0.000156
##  ->  class 1  [0.857]
## 
## Rule 23: (7/1, lift 16.3)
##  Attr8 <= 0.33112
##  Attr29 <= 3.2812
##  Attr34 > 0.14597
##  Attr42 > 0.020436
##  Attr46 <= 0.5449
##  Attr58 <= 0.77596
##  ->  class 1  [0.778]
## 
## Rule 24: (34/8, lift 15.7)
##  Attr5 <= 23.747
##  Attr23 <= -0.021107
##  Attr34 > 0.001229
##  Attr41 <= -0.000156
##  Attr56 <= 0.065998
##  Attr58 <= 1.0202
##  ->  class 1  [0.750]
## 
## Rule 25: (24/8, lift 13.7)
##  Attr8 <= 0.33112
##  Attr34 > 0.14597
##  Attr42 > 0.020436
##  Attr46 <= 0.5449
##  Attr58 <= 0.77596
##  ->  class 1  [0.654]
## 
## Rule 26: (441/164, lift 13.1)
##  Attr24 <= 0.067406
##  Attr27 > 0.87682
##  Attr34 <= 0.22583
##  ->  class 1  [0.628]
## 
## Rule 27: (214/95, lift 11.6)
##  Attr5 > -22.458
##  Attr5 <= 82.901
##  Attr46 <= 0.5449
##  ->  class 1  [0.556]
## 
## Default class: 0
## 
## 
## Evaluation on training data (25588 cases):
## 
##          Rules     
##    ----------------
##      No      Errors
## 
##      27  773( 3.0%)   <<
## 
## 
##     (a)   (b)    <-classified as
##    ----  ----
##   24336    29    (a): class 0
##     744   479    (b): class 1
## 
## 
##  Attribute usage:
## 
##   93.09% Attr27
##   87.30% Attr41
##   84.40% Attr34
##   82.03% Attr46
##   46.16% Attr8
##   28.45% Attr42
##   12.80% Attr5
##    3.75% Attr58
##    2.72% Attr56
##    1.72% Attr24
##    0.62% Attr35
##    0.48% Attr44
##    0.35% Attr6
##    0.13% Attr23
##    0.13% Attr53
##    0.11% Attr64
##    0.06% Attr39
##    0.03% Attr29
## 
## 
## Time: 6.7 secs
  • From the summary output above, you can clearly read the rules and their associated metrics such as lift and support

  • This is great for explicability, and the rules can also be used to surface interesting relationships in the data, even if your final model is not a decision tree

Plotting the tree

  • Call the plot function on the tree object to visualize the tree
plot(c5_tree)

Evaluating the model

Predictions on the test data

  • We'll evaluate the decision tree using standard error metrics on both the train and the test data
preds <- predict(c5_tree, train_data)
preds1 <- predict(c5_tree, test_data)
  • Error metrics for classification can be accessed through the “confusionMatrix()” function from the caret package
conf_train=confusionMatrix(preds, train_data$target, positive = "0")
conf_test=confusionMatrix(preds1, test_data$target, positive = "0")
conf_train
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 24342   632
##          1    23   591
##                                           
##                Accuracy : 0.9744          
##                  95% CI : (0.9724, 0.9763)
##     No Information Rate : 0.9522          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.6317          
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.9991          
##             Specificity : 0.4832          
##          Pos Pred Value : 0.9747          
##          Neg Pred Value : 0.9625          
##              Prevalence : 0.9522          
##          Detection Rate : 0.9513          
##    Detection Prevalence : 0.9760          
##       Balanced Accuracy : 0.7411          
##                                           
##        'Positive' Class : 0               
## 
conf_test
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 10384   301
##          1    39   241
##                                           
##                Accuracy : 0.969           
##                  95% CI : (0.9656, 0.9722)
##     No Information Rate : 0.9506          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.572           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.9963          
##             Specificity : 0.4446          
##          Pos Pred Value : 0.9718          
##          Neg Pred Value : 0.8607          
##              Prevalence : 0.9506          
##          Detection Rate : 0.9470          
##    Detection Prevalence : 0.9745          
##       Balanced Accuracy : 0.7205          
##                                           
##        'Positive' Class : 0               
## 

Finding the F1 score, since both high precision and high recall matter for this problem; a minimal sketch follows
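
A minimal sketch, computing precision, recall and F1 for class "1" (bankrupt) from the caret confusion matrix on the test data:

cm <- conf_test$table                       # rows = predictions, columns = reference
precision <- cm["1", "1"] / sum(cm["1", ])  # TP / (TP + FP)
recall <- cm["1", "1"] / sum(cm[, "1"])     # TP / (TP + FN)
f1 <- 2 * precision * recall / (precision + recall)
f1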

CART Trees

  • Classification and regression trees (CART) use the Gini index in place of the gain ratio (based on information gain) used by ID3-family algorithms such as C4.5 and C5.0

Goal

  • The goal of this activity is to predict whether a company goes bankrupt using a classification and regression tree (CART)
reg_tree <- rpart(target ~ ., train_data, method = "class")

printcp(reg_tree)
## 
## Classification tree:
## rpart(formula = target ~ ., data = train_data, method = "class")
## 
## Variables actually used in tree construction:
## [1] Attr24 Attr27 Attr34 Attr35 Attr56
## 
## Root node error: 1223/25588 = 0.047796
## 
## n= 25588 
## 
##         CP nsplit rel error  xerror     xstd
## 1 0.027187      0   1.00000 1.00000 0.027903
## 2 0.024530      4   0.89125 0.92723 0.026918
## 3 0.010000      8   0.79313 0.80948 0.025225
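
The cp table above can drive pruning; a minimal sketch that prunes at the complexity parameter with the lowest cross-validated error (xerror):

best_cp <- reg_tree$cptable[which.min(reg_tree$cptable[, "xerror"]), "CP"]
pruned_tree <- prune(reg_tree, cp = best_cp)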

Tree Explicability

  • The variable importance can be accessed via the variable.importance element of the reg_tree object
reg_tree$variable.importance
##      Attr34      Attr56      Attr39      Attr41      Attr35      Attr22 
## 266.0787251 175.2489122  86.2233824  71.8981654  69.5123896  65.9275392 
##      Attr55      Attr58      Attr27      Attr42      Attr33      Attr29 
##  63.1595004  60.1780071  59.7002362  56.7521977  55.7751874  48.6854482 
##       Attr9      Attr36      Attr15      Attr14      Attr18       Attr7 
##  42.9363953  31.9695663  30.2639273  24.7714078  24.7714078  24.7634139 
##      Attr24      Attr19      Attr52      Attr30      Attr32      Attr12 
##  21.1439300  19.7373439  12.0567802  11.0083645  10.9527372   6.9107476 
##       Attr1      Attr48      Attr11      Attr47      Attr10      Attr25 
##   6.5390301   5.7424230   5.4991000   0.9371612   0.4685806   0.4685806
  • We can plot the tree using the rpart.plot() function from the rpart.plot package
rpart.plot(reg_tree)

Evaluating the model

Predictions on the test data

  • We'll evaluate the CART tree with the same error metrics on the test data; first, inspect the detailed summary of splits (a prediction sketch follows after the summary output)
summary(reg_tree) # detailed summary of splits
## Call:
## rpart(formula = target ~ ., data = train_data, method = "class")
##   n= 25588 
## 
##           CP nsplit rel error    xerror       xstd
## 1 0.02718724      0 1.0000000 1.0000000 0.02790306
## 2 0.02452984      4 0.8912510 0.9272281 0.02691763
## 3 0.01000000      8 0.7931316 0.8094849 0.02522452
## 
## Variable importance
## Attr34 Attr56 Attr39 Attr41 Attr35 Attr22 Attr55 Attr58 Attr27 Attr42 
##     20     13      6      5      5      5      5      4      4      4 
## Attr33 Attr29  Attr9 Attr36 Attr15 Attr14 Attr18  Attr7 Attr24 Attr19 
##      4      4      3      2      2      2      2      2      2      1 
## Attr52 Attr30 Attr32 Attr12 
##      1      1      1      1 
## 
## Node number 1: 25588 observations,    complexity param=0.02718724
##   predicted class=0  expected loss=0.04779584  P(node) =1
##     class counts: 24365  1223
##    probabilities: 0.952 0.048 
##   left son=2 (22195 obs) right son=3 (3393 obs)
##   Primary splits:
##       Attr35 < -0.0393365   to the right, improve=69.51239, (0 missing)
##       Attr39 < -0.035456    to the right, improve=69.15036, (0 missing)
##       Attr41 < -0.0128205   to the right, improve=67.27618, (0 missing)
##       Attr26 < 0.032943     to the right, improve=63.03163, (0 missing)
##       Attr24 < 0.053813     to the right, improve=62.76884, (0 missing)
##   Surrogate splits:
##       Attr39 < -0.0238625   to the right, agree=0.974, adj=0.808, (0 split)
##       Attr22 < -0.0393375   to the right, agree=0.967, adj=0.749, (0 split)
##       Attr56 < -0.021217    to the right, agree=0.956, adj=0.665, (0 split)
##       Attr42 < -0.019477    to the right, agree=0.950, adj=0.625, (0 split)
##       Attr41 < -0.000131595 to the right, agree=0.944, adj=0.580, (0 split)
## 
## Node number 2: 22195 observations,    complexity param=0.02452984
##   predicted class=0  expected loss=0.0333859  P(node) =0.8673988
##     class counts: 21454   741
##    probabilities: 0.967 0.033 
##   left son=4 (16905 obs) right son=5 (5290 obs)
##   Primary splits:
##       Attr24 < 0.053813     to the right, improve=21.14393, (0 missing)
##       Attr46 < 0.470925     to the right, improve=18.38534, (0 missing)
##       Attr26 < 0.124525     to the right, improve=16.31054, (0 missing)
##       Attr16 < 0.138375     to the right, improve=15.81470, (0 missing)
##       Attr38 < 0.495755     to the right, improve=11.83234, (0 missing)
##   Surrogate splits:
##       Attr14 < 0.018395     to the right, agree=0.843, adj=0.343, (0 split)
##       Attr18 < 0.018395     to the right, agree=0.843, adj=0.343, (0 split)
##       Attr7  < 0.018395     to the right, agree=0.843, adj=0.342, (0 split)
##       Attr12 < 0.038155     to the right, agree=0.840, adj=0.327, (0 split)
##       Attr1  < 0.0124095    to the right, agree=0.835, adj=0.309, (0 split)
## 
## Node number 3: 3393 observations,    complexity param=0.02718724
##   predicted class=0  expected loss=0.1420572  P(node) =0.1326012
##     class counts:  2911   482
##    probabilities: 0.858 0.142 
##   left son=6 (3283 obs) right son=7 (110 obs)
##   Primary splits:
##       Attr56 < 0.0681095    to the left,  improve=51.54386, (0 missing)
##       Attr58 < 0.87463      to the right, improve=27.36437, (0 missing)
##       Attr27 < 0.988425     to the left,  improve=26.46364, (0 missing)
##       Attr46 < 0.38793      to the right, improve=25.38936, (0 missing)
##       Attr40 < 0.069154     to the right, improve=18.24269, (0 missing)
##   Surrogate splits:
##       Attr9  < -0.0161855   to the right, agree=0.968, adj=0.018, (0 split)
##       Attr47 < -1.953635    to the right, agree=0.968, adj=0.018, (0 split)
##       Attr10 < 1.3329       to the left,  agree=0.968, adj=0.009, (0 split)
##       Attr25 < 1.33203      to the left,  agree=0.968, adj=0.009, (0 split)
##       Attr32 < 16740.5      to the left,  agree=0.968, adj=0.009, (0 split)
## 
## Node number 4: 16905 observations
##   predicted class=0  expected loss=0.02117717  P(node) =0.6606612
##     class counts: 16547   358
##    probabilities: 0.979 0.021 
## 
## Node number 5: 5290 observations,    complexity param=0.02452984
##   predicted class=0  expected loss=0.07240076  P(node) =0.2067375
##     class counts:  4907   383
##    probabilities: 0.928 0.072 
##   left son=10 (3354 obs) right son=11 (1936 obs)
##   Primary splits:
##       Attr27 < 1.0641       to the left,  improve=31.404890, (0 missing)
##       Attr6  < 0.000283725  to the left,  improve= 8.278168, (0 missing)
##       Attr46 < 0.22588      to the right, improve= 7.193995, (0 missing)
##       Attr24 < -0.08189     to the left,  improve= 6.509649, (0 missing)
##       Attr22 < 0.0100565    to the left,  improve= 4.900454, (0 missing)
##   Surrogate splits:
##       Attr48 < 0.011722     to the left,  agree=0.701, adj=0.183, (0 split)
##       Attr11 < 0.0393115    to the left,  agree=0.698, adj=0.175, (0 split)
##       Attr7  < 0.0237045    to the left,  agree=0.697, adj=0.173, (0 split)
##       Attr14 < 0.0237045    to the left,  agree=0.697, adj=0.173, (0 split)
##       Attr18 < 0.0237045    to the left,  agree=0.697, adj=0.173, (0 split)
## 
## Node number 6: 3283 observations,    complexity param=0.02718724
##   predicted class=0  expected loss=0.1261042  P(node) =0.1283023
##     class counts:  2869   414
##    probabilities: 0.874 0.126 
##   left son=12 (2666 obs) right son=13 (617 obs)
##   Primary splits:
##       Attr27 < 0.988425     to the left,  improve=28.29535, (0 missing)
##       Attr46 < 0.63169      to the right, improve=20.93660, (0 missing)
##       Attr40 < 0.0547355    to the right, improve=16.41326, (0 missing)
##       Attr35 < -0.321925    to the right, improve=12.23396, (0 missing)
##       Attr3  < -0.508195    to the right, improve=11.71989, (0 missing)
##   Surrogate splits:
##       Attr22 < 0.00320755   to the left,  agree=0.904, adj=0.489, (0 split)
##       Attr42 < 0.0014794    to the left,  agree=0.900, adj=0.470, (0 split)
##       Attr7  < 0.00135305   to the left,  agree=0.892, adj=0.428, (0 split)
##       Attr14 < 0.00135305   to the left,  agree=0.892, adj=0.428, (0 split)
##       Attr18 < 0.00135305   to the left,  agree=0.892, adj=0.428, (0 split)
## 
## Node number 7: 110 observations
##   predicted class=1  expected loss=0.3818182  P(node) =0.00429889
##     class counts:    42    68
##    probabilities: 0.382 0.618 
## 
## Node number 10: 3354 observations
##   predicted class=0  expected loss=0.03100775  P(node) =0.1310771
##     class counts:  3250   104
##    probabilities: 0.969 0.031 
## 
## Node number 11: 1936 observations,    complexity param=0.02452984
##   predicted class=0  expected loss=0.1441116  P(node) =0.07566047
##     class counts:  1657   279
##    probabilities: 0.856 0.144 
##   left son=22 (1697 obs) right son=23 (239 obs)
##   Primary splits:
##       Attr34 < 0.159565     to the right, improve=125.28570, (0 missing)
##       Attr6  < 0.000283725  to the left,  improve= 57.38552, (0 missing)
##       Attr27 < 1.090225     to the right, improve= 48.32148, (0 missing)
##       Attr9  < 1.1032       to the right, improve= 34.39527, (0 missing)
##       Attr29 < 4.00925      to the left,  improve= 31.25859, (0 missing)
##   Surrogate splits:
##       Attr33 < 0.25645      to the right, agree=0.902, adj=0.205, (0 split)
##       Attr56 < 0.967725     to the left,  agree=0.890, adj=0.113, (0 split)
##       Attr52 < 3.9009       to the left,  agree=0.888, adj=0.096, (0 split)
##       Attr30 < 3.4617       to the left,  agree=0.887, adj=0.088, (0 split)
##       Attr32 < 1423.8       to the left,  agree=0.887, adj=0.084, (0 split)
## 
## Node number 12: 2666 observations
##   predicted class=0  expected loss=0.09452363  P(node) =0.1041895
##     class counts:  2414   252
##    probabilities: 0.905 0.095 
## 
## Node number 13: 617 observations,    complexity param=0.02718724
##   predicted class=0  expected loss=0.2625608  P(node) =0.02411287
##     class counts:   455   162
##    probabilities: 0.737 0.263 
##   left son=26 (510 obs) right son=27 (107 obs)
##   Primary splits:
##       Attr34 < 0.034667     to the right, improve=140.79310, (0 missing)
##       Attr42 < -0.017354    to the right, improve= 67.08098, (0 missing)
##       Attr19 < -0.028688    to the right, improve= 64.23393, (0 missing)
##       Attr31 < -0.028688    to the right, improve= 64.23393, (0 missing)
##       Attr23 < -0.022592    to the right, improve= 63.92920, (0 missing)
##   Surrogate splits:
##       Attr55 < -2890.9      to the right, agree=0.904, adj=0.449, (0 split)
##       Attr29 < 4.3103       to the left,  agree=0.887, adj=0.346, (0 split)
##       Attr41 < -0.10932     to the right, agree=0.865, adj=0.224, (0 split)
##       Attr15 < -1189.6      to the right, agree=0.864, adj=0.215, (0 split)
##       Attr19 < -0.11616     to the right, agree=0.851, adj=0.140, (0 split)
## 
## Node number 22: 1697 observations
##   predicted class=0  expected loss=0.07660577  P(node) =0.06632015
##     class counts:  1567   130
##    probabilities: 0.923 0.077 
## 
## Node number 23: 239 observations,    complexity param=0.02452984
##   predicted class=1  expected loss=0.376569  P(node) =0.009340316
##     class counts:    90   149
##    probabilities: 0.377 0.623 
##   left son=46 (101 obs) right son=47 (138 obs)
##   Primary splits:
##       Attr56 < 0.05258875   to the right, improve=63.31228, (0 missing)
##       Attr58 < 0.9511       to the left,  improve=62.93555, (0 missing)
##       Attr33 < 0.96864      to the left,  improve=43.04750, (0 missing)
##       Attr36 < 0.887245     to the left,  improve=39.93445, (0 missing)
##       Attr27 < 1.092825     to the right, improve=36.18642, (0 missing)
##   Surrogate splits:
##       Attr58 < 0.9511       to the left,  agree=0.979, adj=0.950, (0 split)
##       Attr9  < 1.0591       to the right, agree=0.858, adj=0.663, (0 split)
##       Attr36 < 0.8505       to the left,  agree=0.791, adj=0.505, (0 split)
##       Attr33 < 2.01405      to the left,  agree=0.778, adj=0.475, (0 split)
##       Attr39 < 0.0570895    to the right, agree=0.778, adj=0.475, (0 split)
## 
## Node number 26: 510 observations
##   predicted class=0  expected loss=0.1078431  P(node) =0.01993122
##     class counts:   455    55
##    probabilities: 0.892 0.108 
## 
## Node number 27: 107 observations
##   predicted class=1  expected loss=0  P(node) =0.004181648
##     class counts:     0   107
##    probabilities: 0.000 1.000 
## 
## Node number 46: 101 observations
##   predicted class=0  expected loss=0.1980198  P(node) =0.003947163
##     class counts:    81    20
##    probabilities: 0.802 0.198 
## 
## Node number 47: 138 observations
##   predicted class=1  expected loss=0.06521739  P(node) =0.005393153
##     class counts:     9   129
##    probabilities: 0.065 0.935
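
For a more readable view of these splits, the fitted tree can also be drawn with the rpart.plot package (a minimal sketch, assuming reg_tree is the rpart object printed above):

# Draw the tree; extra = 104 shows the class, per-class probabilities
# and the percentage of observations falling into each node
rpart.plot(reg_tree, type = 2, extra = 104, tweak = 1.2)
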
# Class predictions from the decision tree on the train and test splits
pred2 = predict(reg_tree, train_data, type = "class")
pred3 = predict(reg_tree, test_data, type = "class")

table(pred2, train_data$target)
##      
## pred2     0     1
##     0 24314   919
##     1    51   304
table(pred3, test_data$target)
##      
## pred3     0     1
##     0 10397   401
##     1    26   141

Error metrics for classification can be accessed through the “confusionMatrix()” function from the caret package

conf_train=confusionMatrix(pred2, train_data$target, positive = "0")
conf_test=confusionMatrix(pred3, test_data$target, positive = "0")
conf_train
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 24314   919
##          1    51   304
##                                           
##                Accuracy : 0.9621          
##                  95% CI : (0.9597, 0.9644)
##     No Information Rate : 0.9522          
##     P-Value [Acc > NIR] : 9.278e-15       
##                                           
##                   Kappa : 0.3718          
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.9979          
##             Specificity : 0.2486          
##          Pos Pred Value : 0.9636          
##          Neg Pred Value : 0.8563          
##              Prevalence : 0.9522          
##          Detection Rate : 0.9502          
##    Detection Prevalence : 0.9861          
##       Balanced Accuracy : 0.6232          
##                                           
##        'Positive' Class : 0               
## 
conf_test
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 10397   401
##          1    26   141
##                                           
##                Accuracy : 0.9611          
##                  95% CI : (0.9573, 0.9646)
##     No Information Rate : 0.9506          
##     P-Value [Acc > NIR] : 8.803e-08       
##                                           
##                   Kappa : 0.3834          
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.9975          
##             Specificity : 0.2601          
##          Pos Pred Value : 0.9629          
##          Neg Pred Value : 0.8443          
##              Prevalence : 0.9506          
##          Detection Rate : 0.9482          
##    Detection Prevalence : 0.9848          
##       Balanced Accuracy : 0.6288          
##                                           
##        'Positive' Class : 0               
## 

Compute the F1 score, since both high precision and high recall matter for this imbalanced problem.
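
A minimal sketch of an F1 computation; f1_score is a hypothetical helper (not from any package), and the bankrupt class "1" is treated as the class of interest. Alternatively, caret's confusionMatrix() reports precision, recall and F1 directly with mode = "prec_recall".

# F1 from a confusion table (rows = predicted, columns = reference)
f1_score <- function(cm, positive = "1") {
  tp        <- cm[positive, positive]
  precision <- tp / sum(cm[positive, ])  # TP / (TP + FP)
  recall    <- tp / sum(cm[, positive])  # TP / (TP + FN)
  2 * precision * recall / (precision + recall)
}

f1_score(conf_train$table)  # decision tree, train split
f1_score(conf_test$table)   # decision tree, test split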

Build KNN model

# Build the kNN classifier (class::knn); try k in {1, 3, 5, 7}; here k = 3
Neigh <- 3
pred <- knn(train_data[, num_var], test_data[, num_var],
            train_data$target, k = Neigh)
a <- table(pred, test_data$target)
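
Since kNN is distance-based, attributes on large scales (e.g. Attr15, Attr32) can dominate the Euclidean distance. If the numeric columns were not already standardized upstream, a sketch of scaling both splits with the training statistics:

# Standardize using the training means/sds, then re-run kNN
train_sc <- scale(train_data[, num_var])
test_sc  <- scale(test_data[, num_var],
                  center = attr(train_sc, "scaled:center"),
                  scale  = attr(train_sc, "scaled:scale"))
pred_sc  <- knn(train_sc, test_sc, train_data$target, k = Neigh)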

Error metrics for classification can be accessed through the “confusionMatrix()” function from the caret package

conf_test=confusionMatrix(pred, test_data$target, positive = "0")
conf_test
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 10318   528
##          1   105    14
##                                           
##                Accuracy : 0.9423          
##                  95% CI : (0.9377, 0.9466)
##     No Information Rate : 0.9506          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.025           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.98993         
##             Specificity : 0.02583         
##          Pos Pred Value : 0.95132         
##          Neg Pred Value : 0.11765         
##              Prevalence : 0.95057         
##          Detection Rate : 0.94099         
##    Detection Prevalence : 0.98915         
##       Balanced Accuracy : 0.50788         
##                                           
##        'Positive' Class : 0               
## 

Compute the F1 score for kNN as well, since high precision and recall remain the priority.
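
Reusing the f1_score sketch from the decision-tree section on the kNN result:

f1_score(conf_test$table)  # conf_test was re-assigned to the kNN matrix above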

Model Building using Random Forest and tuning

# Build the classification model using randomForest;
# keep.forest = TRUE retains the trees for later prediction
model = randomForest(target ~ ., data = train_data,
                     keep.forest = TRUE, ntree = 50)

# Print and understand the model
print(model)
## 
## Call:
##  randomForest(formula = target ~ ., data = train_data, keep.forest = TRUE,      ntree = 50) 
##                Type of random forest: classification
##                      Number of trees: 50
## No. of variables tried at each split: 8
## 
##         OOB estimate of  error rate: 4.13%
## Confusion matrix:
##       0   1 class.error
## 0 24280  85 0.003488611
## 1   971 252 0.793949305

Important attributes

model$importance  
##        MeanDecreaseGini
## ID             40.18912
## Attr1          26.56679
## Attr2          22.61744
## Attr3          29.02045
## Attr4          28.61218
## Attr5          38.50798
## Attr6          45.20829
## Attr7          26.39948
## Attr8          23.70999
## Attr9          49.74943
## Attr10         23.58335
## Attr11         28.47532
## Attr12         23.86368
## Attr13         28.45474
## Attr14         24.83425
## Attr15         32.23372
## Attr16         34.57927
## Attr17         23.54452
## Attr18         21.70739
## Attr19         23.32987
## Attr20         31.25680
## Attr21         29.68573
## Attr22         34.05966
## Attr23         23.97626
## Attr24         61.71602
## Attr25         35.18235
## Attr26         33.61562
## Attr27        138.38957
## Attr28         24.29480
## Attr29         46.60052
## Attr30         28.36964
## Attr31         23.87696
## Attr32         24.83620
## Attr33         31.88914
## Attr34         83.99513
## Attr35         50.06304
## Attr36         27.82057
## Attr37         24.70612
## Attr38         30.13073
## Attr39         48.89836
## Attr40         44.46032
## Attr41         40.83784
## Attr42         41.63294
## Attr43         25.77022
## Attr44         43.71854
## Attr45         28.90882
## Attr46         78.30439
## Attr47         31.79035
## Attr48         30.48163
## Attr49         28.96734
## Attr50         28.56525
## Attr51         26.85638
## Attr52         26.87720
## Attr53         26.97061
## Attr54         27.52999
## Attr55         37.75451
## Attr56         62.71097
## Attr57         28.31858
## Attr58         63.23325
## Attr59         19.47262
## Attr60         34.29337
## Attr61         41.03418
## Attr62         24.06294
## Attr63         25.16699
## Attr64         26.52542
round(importance(model), 2)   
##        MeanDecreaseGini
## ID                40.19
## Attr1             26.57
## Attr2             22.62
## Attr3             29.02
## Attr4             28.61
## Attr5             38.51
## Attr6             45.21
## Attr7             26.40
## Attr8             23.71
## Attr9             49.75
## Attr10            23.58
## Attr11            28.48
## Attr12            23.86
## Attr13            28.45
## Attr14            24.83
## Attr15            32.23
## Attr16            34.58
## Attr17            23.54
## Attr18            21.71
## Attr19            23.33
## Attr20            31.26
## Attr21            29.69
## Attr22            34.06
## Attr23            23.98
## Attr24            61.72
## Attr25            35.18
## Attr26            33.62
## Attr27           138.39
## Attr28            24.29
## Attr29            46.60
## Attr30            28.37
## Attr31            23.88
## Attr32            24.84
## Attr33            31.89
## Attr34            84.00
## Attr35            50.06
## Attr36            27.82
## Attr37            24.71
## Attr38            30.13
## Attr39            48.90
## Attr40            44.46
## Attr41            40.84
## Attr42            41.63
## Attr43            25.77
## Attr44            43.72
## Attr45            28.91
## Attr46            78.30
## Attr47            31.79
## Attr48            30.48
## Attr49            28.97
## Attr50            28.57
## Attr51            26.86
## Attr52            26.88
## Attr53            26.97
## Attr54            27.53
## Attr55            37.75
## Attr56            62.71
## Attr57            28.32
## Attr58            63.23
## Attr59            19.47
## Attr60            34.29
## Attr61            41.03
## Attr62            24.06
## Attr63            25.17
## Attr64            26.53

Extract and store important variables obtained from the random forest model

rf_Imp_Attr = data.frame(model$importance)
rf_Imp_Attr = data.frame(row.names(rf_Imp_Attr),rf_Imp_Attr[,1])
colnames(rf_Imp_Attr) = c('Attributes', 'Importance')
rf_Imp_Attr = rf_Imp_Attr[order(rf_Imp_Attr$Importance, decreasing = TRUE),]
rf_Imp_Attr
##    Attributes Importance
## 28     Attr27  138.38957
## 35     Attr34   83.99513
## 47     Attr46   78.30439
## 59     Attr58   63.23325
## 57     Attr56   62.71097
## 25     Attr24   61.71602
## 36     Attr35   50.06304
## 10      Attr9   49.74943
## 40     Attr39   48.89836
## 30     Attr29   46.60052
## 7       Attr6   45.20829
## 41     Attr40   44.46032
## 45     Attr44   43.71854
## 43     Attr42   41.63294
## 62     Attr61   41.03418
## 42     Attr41   40.83784
## 1          ID   40.18912
## 6       Attr5   38.50798
## 56     Attr55   37.75451
## 26     Attr25   35.18235
## 17     Attr16   34.57927
## 61     Attr60   34.29337
## 23     Attr22   34.05966
## 27     Attr26   33.61562
## 16     Attr15   32.23372
## 34     Attr33   31.88914
## 48     Attr47   31.79035
## 21     Attr20   31.25680
## 49     Attr48   30.48163
## 39     Attr38   30.13073
## 22     Attr21   29.68573
## 4       Attr3   29.02045
## 50     Attr49   28.96734
## 46     Attr45   28.90882
## 5       Attr4   28.61218
## 51     Attr50   28.56525
## 12     Attr11   28.47532
## 14     Attr13   28.45474
## 31     Attr30   28.36964
## 58     Attr57   28.31858
## 37     Attr36   27.82057
## 55     Attr54   27.52999
## 54     Attr53   26.97061
## 53     Attr52   26.87720
## 52     Attr51   26.85638
## 2       Attr1   26.56679
## 65     Attr64   26.52542
## 8       Attr7   26.39948
## 44     Attr43   25.77022
## 64     Attr63   25.16699
## 33     Attr32   24.83620
## 15     Attr14   24.83425
## 38     Attr37   24.70612
## 29     Attr28   24.29480
## 63     Attr62   24.06294
## 24     Attr23   23.97626
## 32     Attr31   23.87696
## 13     Attr12   23.86368
## 9       Attr8   23.70999
## 11     Attr10   23.58335
## 18     Attr17   23.54452
## 20     Attr19   23.32987
## 3       Attr2   22.61744
## 19     Attr18   21.70739
## 60     Attr59   19.47262

Plot the variable importance (varImpPlot() draws the same rankings graphically). Note that ID shows non-trivial importance, which hints at identifier leakage; dropping ID before training would be safer.

varImpPlot(model)

Predict on Train data

pred_Train_rd = predict(model, 
                     train_data[,setdiff(names(train_data), "target")],
                     type="response", 
                     norm.votes=TRUE)

Build confusion matrix and find accuracy

cm_Train = table("actual"= train_data$target, "predicted" = pred_Train_rd);
accu_Train= sum(diag(cm_Train))/sum(cm_Train)
#rm(pred_Train_rd, cm_Train)

Predict on Test Data

pred_Test_rd = predict(model, test_data[,setdiff(names(test_data),
                                              "target")],
                    type="response", 
                    norm.votes=TRUE)

Build confusion matrix and find accuracy

cm_Test = table("actual"=test_data$target, "predicted"=pred_Test_rd);
accu_Test= sum(diag(cm_Test))/sum(cm_Test)
#rm(pred_Test, cm_Test)

Check the accuracy on train and test; the near-perfect train accuracy against roughly 0.96 on test suggests the forest overfits the training split.

accu_Train
## [1] 0.9996874
accu_Test
## [1] 0.9605107

Build a random forest using the top 9 important attributes.

top_Imp_Attr = as.character(rf_Imp_Attr$Attributes[1:9])
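
As a sanity check, these are the nine attributes selected (read off the sorted importance table above):

top_Imp_Attr
## [1] "Attr27" "Attr34" "Attr46" "Attr58" "Attr56" "Attr24" "Attr35" "Attr9"  "Attr39"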

Build the classification model using randomForest

model_Imp = randomForest(target~.,
                         data=train_data[,c(top_Imp_Attr,"target")], 
                         keep.forest=TRUE,ntree=50) 

Important attributes

model_Imp$importance
##        MeanDecreaseGini
## Attr27         328.5925
## Attr34         385.3800
## Attr46         251.1152
## Attr58         252.9222
## Attr56         241.1253
## Attr24         228.2003
## Attr35         203.9537
## Attr9          236.9829
## Attr39         196.9732

Predict on Train data

pred_Train_rd_attr = predict(model_Imp, train_data[,top_Imp_Attr],
                     type="response", norm.votes=TRUE)

Build confusion matrix and find accuracy

cm_Train = table("actual" = train_data$target, 
                 "predicted" = pred_Train_rd_attr);
accu_Train_Imp = sum(diag(cm_Train))/sum(cm_Train)
#rm(pred_Train, cm_Train)

Predict on Test Data

pred_Test_rd_attr = predict(model_Imp, test_data[,top_Imp_Attr],
                    type="response", norm.votes=TRUE)

Build confusion matrix and find accuracy

cm_Test = table("actual" = test_data$target, 
                "predicted" = pred_Test_rd_attr);
accu_Test_Imp = sum(diag(cm_Test))/sum(cm_Test)
#rm(pred_Test, cm_Test)

Tune the random forest: tuneRF() searches over mtry, the number of variables tried at each split

top_Imp_Attr = as.character(rf_Imp_Attr$Attributes[1:9])
set.seed(123)
x <- train_data[,!(names(train_data) %in% c("target"))]
y <- train_data[,(names(train_data) %in% c("target"))]
str(y)
##  Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
tunedmodel <-tuneRF(x, y, ntreeTry = 50, trace=TRUE, plot=TRUE, doBest = TRUE)
## mtry = 8  OOB error = 4.01% 
## Searching left ...
## mtry = 4     OOB error = 4.61% 
## -0.1489776 0.05 
## Searching right ...
## mtry = 16    OOB error = 3.6% 
## 0.104187 0.05 
## mtry = 32    OOB error = 3.22% 
## 0.1043478 0.05 
## mtry = 64    OOB error = 3.12% 
## 0.03033981 0.05
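
With doBest = TRUE, tuneRF() refits and returns a forest at the winning mtry (64 here, with the lowest OOB error of 3.12%); a quick inspection sketch:

tunedmodel$mtry    # mtry selected by the search
print(tunedmodel)  # OOB confusion matrix of the refit forest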

Predict on Train data

pred_Train_rd_tune = predict(tunedmodel, train_data,
                     type="response", norm.votes=TRUE)

Build confusion matrix and find accuracy

cm_Train = table("actual" = train_data$target, 
                 "predicted" = pred_Train_rd_tune);
accu_Train = sum(diag(cm_Train))/sum(cm_Train)
#rm(pred_Train, cm_Train)

Predict on Test Data

pred_Test_rd_tune = predict(tunedmodel, test_data,
                    type="response", norm.votes=TRUE)

Build confusion matrix and find accuracy

cm_Test = table("actual" = test_data$target, 
                "predicted" = pred_Test_rd_tune);
accu_Test = sum(diag(cm_Test))/sum(cm_Test)
#rm(pred_Test, cm_Test)

Get the accuracy on train and test

accu_Train
## [1] 1
accu_Test
## [1] 0.9690834

Error metrics for classification can be accessed through the “confusionMatrix()” function from the caret package

conf_train_rd=confusionMatrix(pred_Train_rd, train_data$target, positive = "0")
conf_test_rd=confusionMatrix(pred_Test_rd, test_data$target, positive = "0")
conf_train_rd_attr=confusionMatrix(pred_Train_rd_attr, train_data$target, positive = "0")
conf_test_rd_attr=confusionMatrix(pred_Test_rd_attr, test_data$target, positive = "0")
conf_train_rd_tune=confusionMatrix(pred_Train_rd_tune, train_data$target, positive = "0")
conf_test_rd_tune=confusionMatrix(pred_Test_rd_tune, test_data$target, positive = "0")
conf_train_rd
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 24365     8
##          1     0  1215
##                                           
##                Accuracy : 0.9997          
##                  95% CI : (0.9994, 0.9999)
##     No Information Rate : 0.9522          
##     P-Value [Acc > NIR] : < 2e-16         
##                                           
##                   Kappa : 0.9966          
##  Mcnemar's Test P-Value : 0.01333         
##                                           
##             Sensitivity : 1.0000          
##             Specificity : 0.9935          
##          Pos Pred Value : 0.9997          
##          Neg Pred Value : 1.0000          
##              Prevalence : 0.9522          
##          Detection Rate : 0.9522          
##    Detection Prevalence : 0.9525          
##       Balanced Accuracy : 0.9967          
##                                           
##        'Positive' Class : 0               
## 
conf_test_rd
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 10393   403
##          1    30   139
##                                           
##                Accuracy : 0.9605          
##                  95% CI : (0.9567, 0.9641)
##     No Information Rate : 0.9506          
##     P-Value [Acc > NIR] : 3.93e-07        
##                                           
##                   Kappa : 0.3763          
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.9971          
##             Specificity : 0.2565          
##          Pos Pred Value : 0.9627          
##          Neg Pred Value : 0.8225          
##              Prevalence : 0.9506          
##          Detection Rate : 0.9478          
##    Detection Prevalence : 0.9846          
##       Balanced Accuracy : 0.6268          
##                                           
##        'Positive' Class : 0               
## 
conf_train_rd_attr
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 24365     4
##          1     0  1219
##                                      
##                Accuracy : 0.9998     
##                  95% CI : (0.9996, 1)
##     No Information Rate : 0.9522     
##     P-Value [Acc > NIR] : <2e-16     
##                                      
##                   Kappa : 0.9983     
##  Mcnemar's Test P-Value : 0.1336     
##                                      
##             Sensitivity : 1.0000     
##             Specificity : 0.9967     
##          Pos Pred Value : 0.9998     
##          Neg Pred Value : 1.0000     
##              Prevalence : 0.9522     
##          Detection Rate : 0.9522     
##    Detection Prevalence : 0.9524     
##       Balanced Accuracy : 0.9984     
##                                      
##        'Positive' Class : 0          
## 
conf_test_rd_attr
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 10394   352
##          1    29   190
##                                           
##                Accuracy : 0.9653          
##                  95% CI : (0.9617, 0.9686)
##     No Information Rate : 0.9506          
##     P-Value [Acc > NIR] : 4.86e-14        
##                                           
##                   Kappa : 0.4847          
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.9972          
##             Specificity : 0.3506          
##          Pos Pred Value : 0.9672          
##          Neg Pred Value : 0.8676          
##              Prevalence : 0.9506          
##          Detection Rate : 0.9479          
##    Detection Prevalence : 0.9800          
##       Balanced Accuracy : 0.6739          
##                                           
##        'Positive' Class : 0               
## 
conf_train_rd_tune
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 24365     0
##          1     0  1223
##                                      
##                Accuracy : 1          
##                  95% CI : (0.9999, 1)
##     No Information Rate : 0.9522     
##     P-Value [Acc > NIR] : < 2.2e-16  
##                                      
##                   Kappa : 1          
##  Mcnemar's Test P-Value : NA         
##                                      
##             Sensitivity : 1.0000     
##             Specificity : 1.0000     
##          Pos Pred Value : 1.0000     
##          Neg Pred Value : 1.0000     
##              Prevalence : 0.9522     
##          Detection Rate : 0.9522     
##    Detection Prevalence : 0.9522     
##       Balanced Accuracy : 1.0000     
##                                      
##        'Positive' Class : 0          
## 
conf_test_rd_tune
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 10396   312
##          1    27   230
##                                           
##                Accuracy : 0.9691          
##                  95% CI : (0.9657, 0.9722)
##     No Information Rate : 0.9506          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.5618          
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.9974          
##             Specificity : 0.4244          
##          Pos Pred Value : 0.9709          
##          Neg Pred Value : 0.8949          
##              Prevalence : 0.9506          
##          Detection Rate : 0.9481          
##    Detection Prevalence : 0.9766          
##       Balanced Accuracy : 0.7109          
##                                           
##        'Positive' Class : 0               
## 

Finally, compute the F1 score for the random-forest variants; high precision and recall remain the key criteria.
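
Test-set F1 for each forest, reusing the f1_score sketch defined earlier:

f1_score(conf_test_rd$table)       # full forest, all attributes
f1_score(conf_test_rd_attr$table)  # forest on the top-9 attributes
f1_score(conf_test_rd_tune$table)  # tuned forest (best test accuracy)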

Write the final prediction to broto_submission_bankruptcy.csv

# Submit the tuned forest's predictions (best test accuracy among the models)
index_value <- data.frame(index = test_data$ID, prediction = pred_Test_rd_tune)
write.csv(index_value, "broto_submission_bankruptcy.csv", na = "", row.names = FALSE)