Loading libraries

library(funModeling)
library(vcd)
library(tidyverse)
library(corrplot)

Loading data

# Read one Porto Seguro CSV, turning the "-1" / "-1.0" sentinel strings
# into NA.
read_porto <- function(path) {
  read.csv(path, na.strings = c("-1", "-1.0"))
}
train <- read_porto("C:/Users/user/Desktop/Kaggle/Porto Seguro/train.csv")
test <- read_porto("C:/Users/user/Desktop/Kaggle/Porto Seguro/test.csv")
# Stack train on top of test; bind_rows() fills columns that are absent
# from one input with NA.
dataset <- bind_rows(train, test)

Exploring the dataset

# Container class of the combined train + test data.
class(dataset)
## [1] "data.frame"
# Number of rows and columns.
dim(dataset)
## [1] 1488028      59
# Column names, in positional order (later code indexes columns by position).
names(dataset)
##  [1] "id"             "target"         "ps_ind_01"      "ps_ind_02_cat" 
##  [5] "ps_ind_03"      "ps_ind_04_cat"  "ps_ind_05_cat"  "ps_ind_06_bin" 
##  [9] "ps_ind_07_bin"  "ps_ind_08_bin"  "ps_ind_09_bin"  "ps_ind_10_bin" 
## [13] "ps_ind_11_bin"  "ps_ind_12_bin"  "ps_ind_13_bin"  "ps_ind_14"     
## [17] "ps_ind_15"      "ps_ind_16_bin"  "ps_ind_17_bin"  "ps_ind_18_bin" 
## [21] "ps_reg_01"      "ps_reg_02"      "ps_reg_03"      "ps_car_01_cat" 
## [25] "ps_car_02_cat"  "ps_car_03_cat"  "ps_car_04_cat"  "ps_car_05_cat" 
## [29] "ps_car_06_cat"  "ps_car_07_cat"  "ps_car_08_cat"  "ps_car_09_cat" 
## [33] "ps_car_10_cat"  "ps_car_11_cat"  "ps_car_11"      "ps_car_12"     
## [37] "ps_car_13"      "ps_car_14"      "ps_car_15"      "ps_calc_01"    
## [41] "ps_calc_02"     "ps_calc_03"     "ps_calc_04"     "ps_calc_05"    
## [45] "ps_calc_06"     "ps_calc_07"     "ps_calc_08"     "ps_calc_09"    
## [49] "ps_calc_10"     "ps_calc_11"     "ps_calc_12"     "ps_calc_13"    
## [53] "ps_calc_14"     "ps_calc_15_bin" "ps_calc_16_bin" "ps_calc_17_bin"
## [57] "ps_calc_18_bin" "ps_calc_19_bin" "ps_calc_20_bin"
# Compact structure listing: storage type and first values of every column.
str(dataset)
## 'data.frame':    1488028 obs. of  59 variables:
##  $ id            : int  7 9 13 16 17 19 20 22 26 28 ...
##  $ target        : int  0 0 0 0 0 0 0 0 0 1 ...
##  $ ps_ind_01     : int  2 1 5 0 0 5 2 5 5 1 ...
##  $ ps_ind_02_cat : int  2 1 4 1 2 1 1 1 1 1 ...
##  $ ps_ind_03     : int  5 7 9 2 0 4 3 4 3 2 ...
##  $ ps_ind_04_cat : int  1 0 1 0 1 0 1 0 1 0 ...
##  $ ps_ind_05_cat : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ ps_ind_06_bin : int  0 0 0 1 1 0 0 1 0 0 ...
##  $ ps_ind_07_bin : int  1 0 0 0 0 0 1 0 0 1 ...
##  $ ps_ind_08_bin : int  0 1 1 0 0 0 0 0 1 0 ...
##  $ ps_ind_09_bin : int  0 0 0 0 0 1 0 0 0 0 ...
##  $ ps_ind_10_bin : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ ps_ind_11_bin : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ ps_ind_12_bin : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ ps_ind_13_bin : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ ps_ind_14     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ ps_ind_15     : int  11 3 12 8 9 6 8 13 6 4 ...
##  $ ps_ind_16_bin : int  0 0 1 1 1 1 1 1 1 0 ...
##  $ ps_ind_17_bin : int  1 0 0 0 0 0 0 0 0 0 ...
##  $ ps_ind_18_bin : int  0 1 0 0 0 0 0 0 0 1 ...
##  $ ps_reg_01     : num  0.7 0.8 0 0.9 0.7 0.9 0.6 0.7 0.9 0.9 ...
##  $ ps_reg_02     : num  0.2 0.4 0 0.2 0.6 1.8 0.1 0.4 0.7 1.4 ...
##  $ ps_reg_03     : num  0.718 0.766 NA 0.581 0.841 ...
##  $ ps_car_01_cat : int  10 11 7 7 11 10 6 11 10 11 ...
##  $ ps_car_02_cat : int  1 1 1 1 1 0 1 1 1 0 ...
##  $ ps_car_03_cat : int  NA NA NA 0 NA NA NA 0 NA 0 ...
##  $ ps_car_04_cat : int  0 0 0 0 0 0 0 0 0 1 ...
##  $ ps_car_05_cat : int  1 NA NA 1 NA 0 1 0 1 0 ...
##  $ ps_car_06_cat : int  4 11 14 11 14 14 11 11 14 14 ...
##  $ ps_car_07_cat : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ ps_car_08_cat : int  0 1 1 1 1 1 1 1 1 1 ...
##  $ ps_car_09_cat : int  0 2 2 3 2 0 0 2 0 2 ...
##  $ ps_car_10_cat : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ ps_car_11_cat : int  12 19 60 104 82 104 99 30 68 104 ...
##  $ ps_car_11     : int  2 3 1 1 3 2 2 3 3 2 ...
##  $ ps_car_12     : num  0.4 0.316 0.316 0.374 0.316 ...
##  $ ps_car_13     : num  0.884 0.619 0.642 0.543 0.566 ...
##  $ ps_car_14     : num  0.371 0.389 0.347 0.295 0.365 ...
##  $ ps_car_15     : num  3.61 2.45 3.32 2 2 ...
##  $ ps_calc_01    : num  0.6 0.3 0.5 0.6 0.4 0.7 0.2 0.1 0.9 0.7 ...
##  $ ps_calc_02    : num  0.5 0.1 0.7 0.9 0.6 0.8 0.6 0.5 0.8 0.8 ...
##  $ ps_calc_03    : num  0.2 0.3 0.1 0.1 0 0.4 0.5 0.1 0.6 0.8 ...
##  $ ps_calc_04    : int  3 2 2 2 2 3 2 1 3 2 ...
##  $ ps_calc_05    : int  1 1 2 4 2 1 2 2 1 2 ...
##  $ ps_calc_06    : int  10 9 9 7 6 8 8 7 7 8 ...
##  $ ps_calc_07    : int  1 5 1 1 3 2 1 1 3 2 ...
##  $ ps_calc_08    : int  10 8 8 8 10 11 8 6 9 9 ...
##  $ ps_calc_09    : int  1 1 2 4 2 3 3 1 4 1 ...
##  $ ps_calc_10    : int  5 7 7 2 12 8 10 13 11 11 ...
##  $ ps_calc_11    : int  9 3 4 2 3 4 3 7 4 3 ...
##  $ ps_calc_12    : int  1 1 2 2 1 2 0 1 2 5 ...
##  $ ps_calc_13    : int  5 1 7 4 1 0 0 3 1 0 ...
##  $ ps_calc_14    : int  8 9 7 9 3 9 10 6 5 6 ...
##  $ ps_calc_15_bin: int  0 0 0 0 0 0 0 1 0 0 ...
##  $ ps_calc_16_bin: int  1 1 1 0 0 1 1 0 1 1 ...
##  $ ps_calc_17_bin: int  1 1 1 0 0 0 0 1 0 0 ...
##  $ ps_calc_18_bin: int  0 0 0 0 1 1 0 0 0 0 ...
##  $ ps_calc_19_bin: int  0 1 1 0 1 1 1 1 0 1 ...
##  $ ps_calc_20_bin: int  1 0 0 0 0 1 0 0 1 0 ...
# Per-column summary statistics (min, quartiles, mean, max) and NA counts.
summary(dataset)
##        id              target         ps_ind_01     ps_ind_02_cat 
##  Min.   :      0   Min.   :0        Min.   :0.000   Min.   :1.00  
##  1st Qu.: 372007   1st Qu.:0        1st Qu.:0.000   1st Qu.:1.00  
##  Median : 744014   Median :0        Median :1.000   Median :1.00  
##  Mean   : 744014   Mean   :0        Mean   :1.902   Mean   :1.36  
##  3rd Qu.:1116020   3rd Qu.:0        3rd Qu.:3.000   3rd Qu.:2.00  
##  Max.   :1488027   Max.   :1        Max.   :7.000   Max.   :4.00  
##                    NA's   :892816                   NA's   :523   
##    ps_ind_03      ps_ind_04_cat    ps_ind_05_cat   ps_ind_06_bin   
##  Min.   : 0.000   Min.   :0.0000   Min.   :0.000   Min.   :0.0000  
##  1st Qu.: 2.000   1st Qu.:0.0000   1st Qu.:0.000   1st Qu.:0.0000  
##  Median : 4.000   Median :0.0000   Median :0.000   Median :0.0000  
##  Mean   : 4.418   Mean   :0.4173   Mean   :0.421   Mean   :0.3934  
##  3rd Qu.: 6.000   3rd Qu.:1.0000   3rd Qu.:0.000   3rd Qu.:1.0000  
##  Max.   :11.000   Max.   :1.0000   Max.   :6.000   Max.   :1.0000  
##                   NA's   :228      NA's   :14519                   
##  ps_ind_07_bin    ps_ind_08_bin    ps_ind_09_bin    ps_ind_10_bin     
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.000000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.000000  
##  Median :0.0000   Median :0.0000   Median :0.0000   Median :0.000000  
##  Mean   :0.2571   Mean   :0.1638   Mean   :0.1857   Mean   :0.000373  
##  3rd Qu.:1.0000   3rd Qu.:0.0000   3rd Qu.:0.0000   3rd Qu.:0.000000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.000000  
##                                                                       
##  ps_ind_11_bin      ps_ind_12_bin      ps_ind_13_bin     
##  Min.   :0.000000   Min.   :0.000000   Min.   :0.000000  
##  1st Qu.:0.000000   1st Qu.:0.000000   1st Qu.:0.000000  
##  Median :0.000000   Median :0.000000   Median :0.000000  
##  Mean   :0.001634   Mean   :0.009401   Mean   :0.001003  
##  3rd Qu.:0.000000   3rd Qu.:0.000000   3rd Qu.:0.000000  
##  Max.   :1.000000   Max.   :1.000000   Max.   :1.000000  
##                                                          
##    ps_ind_14         ps_ind_15      ps_ind_16_bin    ps_ind_17_bin   
##  Min.   :0.00000   Min.   : 0.000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.00000   1st Qu.: 5.000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :0.00000   Median : 7.000   Median :1.0000   Median :0.0000  
##  Mean   :0.01241   Mean   : 7.298   Mean   :0.6607   Mean   :0.1207  
##  3rd Qu.:0.00000   3rd Qu.:10.000   3rd Qu.:1.0000   3rd Qu.:0.0000  
##  Max.   :4.00000   Max.   :13.000   Max.   :1.0000   Max.   :1.0000  
##                                                                      
##  ps_ind_18_bin      ps_reg_01       ps_reg_02        ps_reg_03     
##  Min.   :0.0000   Min.   :0.000   Min.   :0.0000   Min.   :0.06    
##  1st Qu.:0.0000   1st Qu.:0.400   1st Qu.:0.2000   1st Qu.:0.63    
##  Median :0.0000   Median :0.700   Median :0.3000   Median :0.80    
##  Mean   :0.1544   Mean   :0.611   Mean   :0.4396   Mean   :0.89    
##  3rd Qu.:0.0000   3rd Qu.:0.900   3rd Qu.:0.6000   3rd Qu.:1.09    
##  Max.   :1.0000   Max.   :0.900   Max.   :1.8000   Max.   :4.42    
##                                                    NA's   :269456  
##  ps_car_01_cat    ps_car_02_cat    ps_car_03_cat     ps_car_04_cat   
##  Min.   : 0.000   Min.   :0.0000   Min.   :0.0       Min.   :0.0000  
##  1st Qu.: 7.000   1st Qu.:1.0000   1st Qu.:0.0       1st Qu.:0.0000  
##  Median : 7.000   Median :1.0000   Median :1.0       Median :0.0000  
##  Mean   : 8.295   Mean   :0.8299   Mean   :0.6       Mean   :0.7256  
##  3rd Qu.:11.000   3rd Qu.:1.0000   3rd Qu.:1.0       3rd Qu.:0.0000  
##  Max.   :11.000   Max.   :1.0000   Max.   :1.0       Max.   :9.0000  
##  NA's   :267      NA's   :10       NA's   :1028142                   
##  ps_car_05_cat    ps_car_06_cat    ps_car_07_cat   ps_car_08_cat   
##  Min.   :0.0      Min.   : 0.000   Min.   :0.000   Min.   :0.0000  
##  1st Qu.:0.0      1st Qu.: 1.000   1st Qu.:1.000   1st Qu.:1.0000  
##  Median :1.0      Median : 7.000   Median :1.000   Median :1.0000  
##  Mean   :0.5      Mean   : 6.561   Mean   :0.948   Mean   :0.8322  
##  3rd Qu.:1.0      3rd Qu.:11.000   3rd Qu.:1.000   3rd Qu.:1.0000  
##  Max.   :1.0      Max.   :17.000   Max.   :1.000   Max.   :1.0000  
##  NA's   :666910                    NA's   :28820                   
##  ps_car_09_cat   ps_car_10_cat    ps_car_11_cat      ps_car_11    
##  Min.   :0.000   Min.   :0.0000   Min.   :  1.00   Min.   :0.000  
##  1st Qu.:0.000   1st Qu.:1.0000   1st Qu.: 32.00   1st Qu.:2.000  
##  Median :2.000   Median :1.0000   Median : 65.00   Median :3.000  
##  Mean   :1.331   Mean   :0.9921   Mean   : 62.26   Mean   :2.346  
##  3rd Qu.:2.000   3rd Qu.:1.0000   3rd Qu.: 94.00   3rd Qu.:3.000  
##  Max.   :4.000   Max.   :2.0000   Max.   :104.00   Max.   :3.000  
##  NA's   :1446                                      NA's   :6      
##    ps_car_12        ps_car_13        ps_car_14        ps_car_15    
##  Min.   :0.1000   Min.   :0.2506   Min.   :0.11     Min.   :0.000  
##  1st Qu.:0.3162   1st Qu.:0.6710   1st Qu.:0.35     1st Qu.:2.828  
##  Median :0.3742   Median :0.7660   Median :0.37     Median :3.317  
##  Mean   :0.3800   Mean   :0.8135   Mean   :0.37     Mean   :3.067  
##  3rd Qu.:0.4000   3rd Qu.:0.9061   3rd Qu.:0.40     3rd Qu.:3.606  
##  Max.   :1.2649   Max.   :4.0313   Max.   :0.64     Max.   :3.742  
##  NA's   :1                         NA's   :106425                  
##    ps_calc_01       ps_calc_02       ps_calc_03     ps_calc_04   
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.00   Min.   :0.000  
##  1st Qu.:0.2000   1st Qu.:0.2000   1st Qu.:0.20   1st Qu.:2.000  
##  Median :0.4000   Median :0.5000   Median :0.50   Median :2.000  
##  Mean   :0.4497   Mean   :0.4501   Mean   :0.45   Mean   :2.372  
##  3rd Qu.:0.7000   3rd Qu.:0.7000   3rd Qu.:0.70   3rd Qu.:3.000  
##  Max.   :0.9000   Max.   :0.9000   Max.   :0.90   Max.   :5.000  
##                                                                  
##    ps_calc_05      ps_calc_06       ps_calc_07      ps_calc_08    
##  Min.   :0.000   Min.   : 0.000   Min.   :0.000   Min.   : 1.000  
##  1st Qu.:1.000   1st Qu.: 7.000   1st Qu.:2.000   1st Qu.: 8.000  
##  Median :2.000   Median : 8.000   Median :3.000   Median : 9.000  
##  Mean   :1.886   Mean   : 7.688   Mean   :3.008   Mean   : 9.226  
##  3rd Qu.:3.000   3rd Qu.: 9.000   3rd Qu.:4.000   3rd Qu.:10.000  
##  Max.   :6.000   Max.   :10.000   Max.   :9.000   Max.   :12.000  
##                                                                   
##    ps_calc_09      ps_calc_10       ps_calc_11      ps_calc_12    
##  Min.   :0.000   Min.   : 0.000   Min.   : 0.00   Min.   : 0.000  
##  1st Qu.:1.000   1st Qu.: 6.000   1st Qu.: 4.00   1st Qu.: 1.000  
##  Median :2.000   Median : 8.000   Median : 5.00   Median : 1.000  
##  Mean   :2.339   Mean   : 8.439   Mean   : 5.44   Mean   : 1.441  
##  3rd Qu.:3.000   3rd Qu.:10.000   3rd Qu.: 7.00   3rd Qu.: 2.000  
##  Max.   :7.000   Max.   :25.000   Max.   :20.00   Max.   :11.000  
##                                                                   
##    ps_calc_13       ps_calc_14    ps_calc_15_bin   ps_calc_16_bin  
##  Min.   : 0.000   Min.   : 0.00   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.: 2.000   1st Qu.: 6.00   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median : 3.000   Median : 7.00   Median :0.0000   Median :1.0000  
##  Mean   : 2.874   Mean   : 7.54   Mean   :0.1232   Mean   :0.6278  
##  3rd Qu.: 4.000   3rd Qu.: 9.00   3rd Qu.:0.0000   3rd Qu.:1.0000  
##  Max.   :15.000   Max.   :28.00   Max.   :1.0000   Max.   :1.0000  
##                                                                    
##  ps_calc_17_bin   ps_calc_18_bin   ps_calc_19_bin   ps_calc_20_bin  
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :1.0000   Median :0.0000   Median :0.0000   Median :0.0000  
##  Mean   :0.5545   Mean   :0.2876   Mean   :0.3492   Mean   :0.1528  
##  3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:0.0000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
## 
# Transposed overview: one line per column with its type and leading values.
glimpse(dataset)
## Observations: 1,488,028
## Variables: 59
## $ id             <int> 7, 9, 13, 16, 17, 19, 20, 22, 26, 28, 34, 35, 3...
## $ target         <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,...
## $ ps_ind_01      <int> 2, 1, 5, 0, 0, 5, 2, 5, 5, 1, 5, 2, 2, 1, 5, 5,...
## $ ps_ind_02_cat  <int> 2, 1, 4, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1,...
## $ ps_ind_03      <int> 5, 7, 9, 2, 0, 4, 3, 4, 3, 2, 2, 3, 1, 3, 11, 3...
## $ ps_ind_04_cat  <int> 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1,...
## $ ps_ind_05_cat  <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ ps_ind_06_bin  <int> 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ ps_ind_07_bin  <int> 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1,...
## $ ps_ind_08_bin  <int> 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0,...
## $ ps_ind_09_bin  <int> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,...
## $ ps_ind_10_bin  <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ ps_ind_11_bin  <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ ps_ind_12_bin  <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ ps_ind_13_bin  <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ ps_ind_14      <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ ps_ind_15      <int> 11, 3, 12, 8, 9, 6, 8, 13, 6, 4, 3, 9, 10, 12, ...
## $ ps_ind_16_bin  <int> 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0,...
## $ ps_ind_17_bin  <int> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ ps_ind_18_bin  <int> 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1,...
## $ ps_reg_01      <dbl> 0.7, 0.8, 0.0, 0.9, 0.7, 0.9, 0.6, 0.7, 0.9, 0....
## $ ps_reg_02      <dbl> 0.2, 0.4, 0.0, 0.2, 0.6, 1.8, 0.1, 0.4, 0.7, 1....
## $ ps_reg_03      <dbl> 0.7180703, 0.7660777, NA, 0.5809475, 0.8407586,...
## $ ps_car_01_cat  <int> 10, 11, 7, 7, 11, 10, 6, 11, 10, 11, 11, 11, 6,...
## $ ps_car_02_cat  <int> 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1,...
## $ ps_car_03_cat  <int> NA, NA, NA, 0, NA, NA, NA, 0, NA, 0, NA, NA, NA...
## $ ps_car_04_cat  <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 8, 0, 0, 0, 0, 9,...
## $ ps_car_05_cat  <int> 1, NA, NA, 1, NA, 0, 1, 0, 1, 0, NA, NA, NA, 1,...
## $ ps_car_06_cat  <int> 4, 11, 14, 11, 14, 14, 11, 11, 14, 14, 13, 11, ...
## $ ps_car_07_cat  <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ ps_car_08_cat  <int> 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0,...
## $ ps_car_09_cat  <int> 0, 2, 2, 3, 2, 0, 0, 2, 0, 2, 2, 0, 2, 2, 2, 0,...
## $ ps_car_10_cat  <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ ps_car_11_cat  <int> 12, 19, 60, 104, 82, 104, 99, 30, 68, 104, 20, ...
## $ ps_car_11      <int> 2, 3, 1, 1, 3, 2, 2, 3, 3, 2, 3, 3, 3, 3, 1, 2,...
## $ ps_car_12      <dbl> 0.4000000, 0.3162278, 0.3162278, 0.3741657, 0.3...
## $ ps_car_13      <dbl> 0.8836789, 0.6188165, 0.6415857, 0.5429488, 0.5...
## $ ps_car_14      <dbl> 0.3708099, 0.3887158, 0.3472751, 0.2949576, 0.3...
## $ ps_car_15      <dbl> 3.605551, 2.449490, 3.316625, 2.000000, 2.00000...
## $ ps_calc_01     <dbl> 0.6, 0.3, 0.5, 0.6, 0.4, 0.7, 0.2, 0.1, 0.9, 0....
## $ ps_calc_02     <dbl> 0.5, 0.1, 0.7, 0.9, 0.6, 0.8, 0.6, 0.5, 0.8, 0....
## $ ps_calc_03     <dbl> 0.2, 0.3, 0.1, 0.1, 0.0, 0.4, 0.5, 0.1, 0.6, 0....
## $ ps_calc_04     <int> 3, 2, 2, 2, 2, 3, 2, 1, 3, 2, 2, 2, 4, 2, 3, 2,...
## $ ps_calc_05     <int> 1, 1, 2, 4, 2, 1, 2, 2, 1, 2, 3, 2, 1, 1, 1, 1,...
## $ ps_calc_06     <int> 10, 9, 9, 7, 6, 8, 8, 7, 7, 8, 8, 8, 8, 10, 8, ...
## $ ps_calc_07     <int> 1, 5, 1, 1, 3, 2, 1, 1, 3, 2, 2, 2, 4, 1, 2, 5,...
## $ ps_calc_08     <int> 10, 8, 8, 8, 10, 11, 8, 6, 9, 9, 9, 10, 11, 8, ...
## $ ps_calc_09     <int> 1, 1, 2, 4, 2, 3, 3, 1, 4, 1, 4, 1, 1, 3, 3, 2,...
## $ ps_calc_10     <int> 5, 7, 7, 2, 12, 8, 10, 13, 11, 11, 7, 8, 9, 8, ...
## $ ps_calc_11     <int> 9, 3, 4, 2, 3, 4, 3, 7, 4, 3, 6, 9, 6, 2, 4, 5,...
## $ ps_calc_12     <int> 1, 1, 2, 2, 1, 2, 0, 1, 2, 5, 3, 2, 3, 0, 1, 2,...
## $ ps_calc_13     <int> 5, 1, 7, 4, 1, 0, 0, 3, 1, 0, 3, 1, 3, 4, 3, 6,...
## $ ps_calc_14     <int> 8, 9, 7, 9, 3, 9, 10, 6, 5, 6, 6, 10, 8, 3, 9, ...
## $ ps_calc_15_bin <int> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ ps_calc_16_bin <int> 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1,...
## $ ps_calc_17_bin <int> 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1,...
## $ ps_calc_18_bin <int> 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,...
## $ ps_calc_19_bin <int> 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1,...
## $ ps_calc_20_bin <int> 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0,...

Univariate visualization

Variables are split both by name suffix and by the type they were loaded as.
Categorical variables (suffix `_cat`, suffix `_bin`, and integer type) are shown with bar plots.
Numerical variables are shown with histograms, to get an idea of the shape of the distribution, and with boxplots, to look at skewness and outliers.

Missing values

# Percentage of missing values in a single column.
pct_missing <- function(x) {
  sum(is.na(x)) / length(x) * 100
}
# Evaluate every feature column; `target` is left out by name rather than
# by position. vapply() walks the columns directly, whereas apply() would
# first coerce the whole data frame to a matrix.
pMiss <- vapply(
  dataset[setdiff(names(dataset), "target")],
  pct_missing,
  numeric(1)
)
# Keep only the features that actually have missing values, worst first.
pMiss <- pMiss[pMiss > 0]
pMiss <- pMiss[order(pMiss, decreasing = TRUE)]
pMiss
## ps_car_03_cat ps_car_05_cat     ps_reg_03     ps_car_14 ps_car_07_cat 
##  6.909426e+01  4.481838e+01  1.810826e+01  7.152083e+00  1.936792e+00 
## ps_ind_05_cat ps_car_09_cat ps_ind_02_cat ps_car_01_cat ps_ind_04_cat 
##  9.757209e-01  9.717559e-02  3.514719e-02  1.794321e-02  1.532229e-02 
## ps_car_02_cat     ps_car_11     ps_car_12 
##  6.720304e-04  4.032182e-04  6.720304e-05
barplot(pMiss, col = 4, las = 3, main = "Features with Missing Values")

Target variable (outcome variable)

# Class balance of the outcome column (column 2 of the data frame).
target_col <- dataset["target"]
barplot(table(target_col), main = colnames(target_col), col = c(4, 5))

Binary variables

# Frequency bar plot for each *_bin column. The column positions are grouped
# per page, and every page starts with a fresh 3x3 layout.
bin_pages <- list(c(8:15, 18), c(19, 20, 54:59))
for (page in bin_pages) {
  par(mfrow = c(3, 3))
  for (idx in page) {
    barplot(table(dataset[idx]), main = colnames(dataset[idx]), col = c(4, 5))
  }
}

Categorical variables

# Frequency bar plot for each *_cat column. Each page carries its own panel
# layout and the column positions drawn on it.
cat_pages <- list(
  list(layout = c(3, 3), cols = c(4, 6, 7, 24:29)),
  list(layout = c(3, 2), cols = 30:34)
)
for (page in cat_pages) {
  par(mfrow = page$layout)
  for (idx in page$cols) {
    barplot(table(dataset[idx]), main = colnames(dataset[idx]), col = 1:8)
  }
}

Integer variables

# Frequency bar plot for each integer-typed feature without a _cat/_bin
# suffix, drawn on 3x3 pages.
int_pages <- list(c(3, 5, 16, 17, 35, 43:46), 47:53)
for (page in int_pages) {
  par(mfrow = c(3, 3))
  for (idx in page) {
    barplot(table(dataset[idx]), main = colnames(dataset[idx]), col = 1:8)
  }
}

Numerical variables

# Distribution of the double-typed features: one joint boxplot, then one
# histogram per column.
num_cols <- c(21:23, 36:42)

# Reset to a single panel so the boxplot is not squeezed into one cell of a
# multi-panel layout left active by earlier plots.
par(mfrow = c(1, 1))
boxplot(dataset[, num_cols], col = 3, las = 3)

hist_pages <- list(
  list(layout = c(3, 2), cols = c(21:23, 36:38)),
  list(layout = c(2, 2), cols = 39:42)
)
for (page in hist_pages) {
  par(mfrow = page$layout)
  for (idx in page$cols) {
    col_name <- colnames(dataset)[idx]
    # Label the x axis with the column name; the default label would be the
    # deparsed subsetting expression.
    hist(dataset[, idx], main = col_name, xlab = col_name, col = 4)
  }
}

Bivariate visualization

Variables are split both by name suffix and by the type they were loaded as.
For categorical variables, mosaic plots are used to look at the relationship between each explanatory variable and the outcome variable.
In the legend, blue rectangles mark the cells with the highest (positive) Pearson residuals, i.e. the combinations that occur more often than expected if the two variables were independent.

Binary variables

# Mosaic plot of one *_bin column (x axis) against the outcome in column 2
# (y axis), with residual shading switched on.
draw_bin_mosaic <- function(idx) {
  invisible(mosaicplot(dataset[, idx] ~ dataset[, 2],
    main = "target/bin", ylab = colnames(dataset[2]),
    xlab = colnames(dataset[idx]), shade = TRUE, legend = TRUE
  ))
}

# Four plots per page; each page starts with a fresh 2x2 layout.
for (page in list(8:11, 12:15, c(18:20, 54), 55:58)) {
  par(mfrow = c(2, 2))
  for (idx in page) {
    draw_bin_mosaic(idx)
  }
}

# The last binary column is drawn without another par() call.
draw_bin_mosaic(59)

Categorical variables

# Mosaic plot of one *_cat column (x axis) against the outcome in column 2
# (y axis), with residual shading switched on.
draw_cat_mosaic <- function(idx) {
  invisible(mosaicplot(dataset[, idx] ~ dataset[, 2],
    main = "target/cat", ylab = colnames(dataset[2]),
    xlab = colnames(dataset[idx]), shade = TRUE, legend = TRUE
  ))
}

# Four plots per page; each page starts with a fresh 2x2 layout.
for (page in list(c(4, 6, 7, 24), 25:28, 29:32)) {
  par(mfrow = c(2, 2))
  for (idx in page) {
    draw_cat_mosaic(idx)
  }
}

# The two remaining columns are drawn without another par() call.
for (idx in 33:34) {
  draw_cat_mosaic(idx)
}

Integer Variables

# Mosaic plot of one integer-typed feature (x axis) against the outcome in
# column 2 (y axis), with residual shading switched on.
draw_int_mosaic <- function(idx) {
  invisible(mosaicplot(dataset[, idx] ~ dataset[, 2],
    main = "target/int", ylab = colnames(dataset[2]),
    xlab = colnames(dataset[idx]), shade = TRUE, legend = TRUE
  ))
}

# Up to nine plots per page; each page starts with a fresh 3x3 layout.
for (page in list(c(3, 5, 16, 17, 35, 43:46), 47:53)) {
  par(mfrow = c(3, 3))
  for (idx in page) {
    draw_int_mosaic(idx)
  }
}

Numerical variables

For numerical variables, cross plots are used: they are similar to histograms, but they also show how the
input variable relates to the outcome (target) variable.
# Cross plot (funModeling) of each double-typed feature against `target`,
# showing quantities only.
num_features <- c(
  "ps_reg_01", "ps_reg_02", "ps_reg_03",
  "ps_car_12", "ps_car_13", "ps_car_14", "ps_car_15",
  "ps_calc_01", "ps_calc_02", "ps_calc_03"
)
for (feature in num_features) {
  # Name the input column explicitly; an empty `str_input=` leaves the
  # argument missing and depends on how cross_plot() handles that case.
  cross_plot(dataset[, c(feature, "target")],
    str_input = feature, str_target = "target", plot_type = "quantity"
  )
}
## [1] "Plotting transformed variable 'ps_reg_03' with 'equal_freq', (too many values). Disable with 'auto_binning=FALSE'"
## [1] "Plotting transformed variable 'ps_car_12' with 'equal_freq', (too many values). Disable with 'auto_binning=FALSE'"
## [1] "Plotting transformed variable 'ps_car_13' with 'equal_freq', (too many values). Disable with 'auto_binning=FALSE'"
## [1] "Plotting transformed variable 'ps_car_14' with 'equal_freq', (too many values). Disable with 'auto_binning=FALSE'"

Correlation between numerical variables

# Spearman rank correlations among the double-typed features, computed on
# complete observations only and drawn as a lower-triangle correlation plot.
num <- dataset[, c(
  "ps_reg_01", "ps_reg_02", "ps_reg_03",
  "ps_car_12", "ps_car_13", "ps_car_14", "ps_car_15",
  "ps_calc_01", "ps_calc_02", "ps_calc_03"
)]
spearman_cor <- cor(num, use = "complete.obs", method = "spearman")
corrplot(spearman_cor, type = "lower", tl.col = "black", diag = FALSE)