library(funModeling)
library(vcd)
library(tidyverse)
library(corrplot)
# Locate the Porto Seguro CSV files. The directory defaults to the location
# this script was written against; set the PORTO_SEGURO_DIR environment
# variable to point somewhere else without editing the script.
data_dir <- Sys.getenv("PORTO_SEGURO_DIR", unset = "")
if (!nzchar(data_dir)) {
  data_dir <- "C:/Users/user/Desktop/Kaggle/Porto Seguro"
}
# The strings "-1" and "-1.0" are read as NA in both files.
na_codes <- c("-1", "-1.0")
train <- read.csv(file.path(data_dir, "train.csv"), na.strings = na_codes)
test <- read.csv(file.path(data_dir, "test.csv"), na.strings = na_codes)
# Stack both sets into one frame. bind_rows() matches columns by name and
# fills a column that is absent from one input with NA.
dataset <- bind_rows(train, test)
# Sanity-check the combined frame: its class, its dimensions (rows, columns)
# and its column names. Lines starting with `##` are recorded console output.
class(dataset)
## [1] "data.frame"
dim(dataset)
## [1] 1488028 59
names(dataset)
## [1] "id" "target" "ps_ind_01" "ps_ind_02_cat"
## [5] "ps_ind_03" "ps_ind_04_cat" "ps_ind_05_cat" "ps_ind_06_bin"
## [9] "ps_ind_07_bin" "ps_ind_08_bin" "ps_ind_09_bin" "ps_ind_10_bin"
## [13] "ps_ind_11_bin" "ps_ind_12_bin" "ps_ind_13_bin" "ps_ind_14"
## [17] "ps_ind_15" "ps_ind_16_bin" "ps_ind_17_bin" "ps_ind_18_bin"
## [21] "ps_reg_01" "ps_reg_02" "ps_reg_03" "ps_car_01_cat"
## [25] "ps_car_02_cat" "ps_car_03_cat" "ps_car_04_cat" "ps_car_05_cat"
## [29] "ps_car_06_cat" "ps_car_07_cat" "ps_car_08_cat" "ps_car_09_cat"
## [33] "ps_car_10_cat" "ps_car_11_cat" "ps_car_11" "ps_car_12"
## [37] "ps_car_13" "ps_car_14" "ps_car_15" "ps_calc_01"
## [41] "ps_calc_02" "ps_calc_03" "ps_calc_04" "ps_calc_05"
## [45] "ps_calc_06" "ps_calc_07" "ps_calc_08" "ps_calc_09"
## [49] "ps_calc_10" "ps_calc_11" "ps_calc_12" "ps_calc_13"
## [53] "ps_calc_14" "ps_calc_15_bin" "ps_calc_16_bin" "ps_calc_17_bin"
## [57] "ps_calc_18_bin" "ps_calc_19_bin" "ps_calc_20_bin"
# Compact structure listing: each column's storage type and first values.
str(dataset)
## 'data.frame': 1488028 obs. of 59 variables:
## $ id : int 7 9 13 16 17 19 20 22 26 28 ...
## $ target : int 0 0 0 0 0 0 0 0 0 1 ...
## $ ps_ind_01 : int 2 1 5 0 0 5 2 5 5 1 ...
## $ ps_ind_02_cat : int 2 1 4 1 2 1 1 1 1 1 ...
## $ ps_ind_03 : int 5 7 9 2 0 4 3 4 3 2 ...
## $ ps_ind_04_cat : int 1 0 1 0 1 0 1 0 1 0 ...
## $ ps_ind_05_cat : int 0 0 0 0 0 0 0 0 0 0 ...
## $ ps_ind_06_bin : int 0 0 0 1 1 0 0 1 0 0 ...
## $ ps_ind_07_bin : int 1 0 0 0 0 0 1 0 0 1 ...
## $ ps_ind_08_bin : int 0 1 1 0 0 0 0 0 1 0 ...
## $ ps_ind_09_bin : int 0 0 0 0 0 1 0 0 0 0 ...
## $ ps_ind_10_bin : int 0 0 0 0 0 0 0 0 0 0 ...
## $ ps_ind_11_bin : int 0 0 0 0 0 0 0 0 0 0 ...
## $ ps_ind_12_bin : int 0 0 0 0 0 0 0 0 0 0 ...
## $ ps_ind_13_bin : int 0 0 0 0 0 0 0 0 0 0 ...
## $ ps_ind_14 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ ps_ind_15 : int 11 3 12 8 9 6 8 13 6 4 ...
## $ ps_ind_16_bin : int 0 0 1 1 1 1 1 1 1 0 ...
## $ ps_ind_17_bin : int 1 0 0 0 0 0 0 0 0 0 ...
## $ ps_ind_18_bin : int 0 1 0 0 0 0 0 0 0 1 ...
## $ ps_reg_01 : num 0.7 0.8 0 0.9 0.7 0.9 0.6 0.7 0.9 0.9 ...
## $ ps_reg_02 : num 0.2 0.4 0 0.2 0.6 1.8 0.1 0.4 0.7 1.4 ...
## $ ps_reg_03 : num 0.718 0.766 NA 0.581 0.841 ...
## $ ps_car_01_cat : int 10 11 7 7 11 10 6 11 10 11 ...
## $ ps_car_02_cat : int 1 1 1 1 1 0 1 1 1 0 ...
## $ ps_car_03_cat : int NA NA NA 0 NA NA NA 0 NA 0 ...
## $ ps_car_04_cat : int 0 0 0 0 0 0 0 0 0 1 ...
## $ ps_car_05_cat : int 1 NA NA 1 NA 0 1 0 1 0 ...
## $ ps_car_06_cat : int 4 11 14 11 14 14 11 11 14 14 ...
## $ ps_car_07_cat : int 1 1 1 1 1 1 1 1 1 1 ...
## $ ps_car_08_cat : int 0 1 1 1 1 1 1 1 1 1 ...
## $ ps_car_09_cat : int 0 2 2 3 2 0 0 2 0 2 ...
## $ ps_car_10_cat : int 1 1 1 1 1 1 1 1 1 1 ...
## $ ps_car_11_cat : int 12 19 60 104 82 104 99 30 68 104 ...
## $ ps_car_11 : int 2 3 1 1 3 2 2 3 3 2 ...
## $ ps_car_12 : num 0.4 0.316 0.316 0.374 0.316 ...
## $ ps_car_13 : num 0.884 0.619 0.642 0.543 0.566 ...
## $ ps_car_14 : num 0.371 0.389 0.347 0.295 0.365 ...
## $ ps_car_15 : num 3.61 2.45 3.32 2 2 ...
## $ ps_calc_01 : num 0.6 0.3 0.5 0.6 0.4 0.7 0.2 0.1 0.9 0.7 ...
## $ ps_calc_02 : num 0.5 0.1 0.7 0.9 0.6 0.8 0.6 0.5 0.8 0.8 ...
## $ ps_calc_03 : num 0.2 0.3 0.1 0.1 0 0.4 0.5 0.1 0.6 0.8 ...
## $ ps_calc_04 : int 3 2 2 2 2 3 2 1 3 2 ...
## $ ps_calc_05 : int 1 1 2 4 2 1 2 2 1 2 ...
## $ ps_calc_06 : int 10 9 9 7 6 8 8 7 7 8 ...
## $ ps_calc_07 : int 1 5 1 1 3 2 1 1 3 2 ...
## $ ps_calc_08 : int 10 8 8 8 10 11 8 6 9 9 ...
## $ ps_calc_09 : int 1 1 2 4 2 3 3 1 4 1 ...
## $ ps_calc_10 : int 5 7 7 2 12 8 10 13 11 11 ...
## $ ps_calc_11 : int 9 3 4 2 3 4 3 7 4 3 ...
## $ ps_calc_12 : int 1 1 2 2 1 2 0 1 2 5 ...
## $ ps_calc_13 : int 5 1 7 4 1 0 0 3 1 0 ...
## $ ps_calc_14 : int 8 9 7 9 3 9 10 6 5 6 ...
## $ ps_calc_15_bin: int 0 0 0 0 0 0 0 1 0 0 ...
## $ ps_calc_16_bin: int 1 1 1 0 0 1 1 0 1 1 ...
## $ ps_calc_17_bin: int 1 1 1 0 0 0 0 1 0 0 ...
## $ ps_calc_18_bin: int 0 0 0 0 1 1 0 0 0 0 ...
## $ ps_calc_19_bin: int 0 1 1 0 1 1 1 1 0 1 ...
## $ ps_calc_20_bin: int 1 0 0 0 0 1 0 0 1 0 ...
# Per-column descriptive statistics (min, quartiles, mean, max) and NA counts.
summary(dataset)
## id target ps_ind_01 ps_ind_02_cat
## Min. : 0 Min. :0 Min. :0.000 Min. :1.00
## 1st Qu.: 372007 1st Qu.:0 1st Qu.:0.000 1st Qu.:1.00
## Median : 744014 Median :0 Median :1.000 Median :1.00
## Mean : 744014 Mean :0 Mean :1.902 Mean :1.36
## 3rd Qu.:1116020 3rd Qu.:0 3rd Qu.:3.000 3rd Qu.:2.00
## Max. :1488027 Max. :1 Max. :7.000 Max. :4.00
## NA's :892816 NA's :523
## ps_ind_03 ps_ind_04_cat ps_ind_05_cat ps_ind_06_bin
## Min. : 0.000 Min. :0.0000 Min. :0.000 Min. :0.0000
## 1st Qu.: 2.000 1st Qu.:0.0000 1st Qu.:0.000 1st Qu.:0.0000
## Median : 4.000 Median :0.0000 Median :0.000 Median :0.0000
## Mean : 4.418 Mean :0.4173 Mean :0.421 Mean :0.3934
## 3rd Qu.: 6.000 3rd Qu.:1.0000 3rd Qu.:0.000 3rd Qu.:1.0000
## Max. :11.000 Max. :1.0000 Max. :6.000 Max. :1.0000
## NA's :228 NA's :14519
## ps_ind_07_bin ps_ind_08_bin ps_ind_09_bin ps_ind_10_bin
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.000000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.000000
## Median :0.0000 Median :0.0000 Median :0.0000 Median :0.000000
## Mean :0.2571 Mean :0.1638 Mean :0.1857 Mean :0.000373
## 3rd Qu.:1.0000 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:0.000000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.000000
##
## ps_ind_11_bin ps_ind_12_bin ps_ind_13_bin
## Min. :0.000000 Min. :0.000000 Min. :0.000000
## 1st Qu.:0.000000 1st Qu.:0.000000 1st Qu.:0.000000
## Median :0.000000 Median :0.000000 Median :0.000000
## Mean :0.001634 Mean :0.009401 Mean :0.001003
## 3rd Qu.:0.000000 3rd Qu.:0.000000 3rd Qu.:0.000000
## Max. :1.000000 Max. :1.000000 Max. :1.000000
##
## ps_ind_14 ps_ind_15 ps_ind_16_bin ps_ind_17_bin
## Min. :0.00000 Min. : 0.000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.00000 1st Qu.: 5.000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.00000 Median : 7.000 Median :1.0000 Median :0.0000
## Mean :0.01241 Mean : 7.298 Mean :0.6607 Mean :0.1207
## 3rd Qu.:0.00000 3rd Qu.:10.000 3rd Qu.:1.0000 3rd Qu.:0.0000
## Max. :4.00000 Max. :13.000 Max. :1.0000 Max. :1.0000
##
## ps_ind_18_bin ps_reg_01 ps_reg_02 ps_reg_03
## Min. :0.0000 Min. :0.000 Min. :0.0000 Min. :0.06
## 1st Qu.:0.0000 1st Qu.:0.400 1st Qu.:0.2000 1st Qu.:0.63
## Median :0.0000 Median :0.700 Median :0.3000 Median :0.80
## Mean :0.1544 Mean :0.611 Mean :0.4396 Mean :0.89
## 3rd Qu.:0.0000 3rd Qu.:0.900 3rd Qu.:0.6000 3rd Qu.:1.09
## Max. :1.0000 Max. :0.900 Max. :1.8000 Max. :4.42
## NA's :269456
## ps_car_01_cat ps_car_02_cat ps_car_03_cat ps_car_04_cat
## Min. : 0.000 Min. :0.0000 Min. :0.0 Min. :0.0000
## 1st Qu.: 7.000 1st Qu.:1.0000 1st Qu.:0.0 1st Qu.:0.0000
## Median : 7.000 Median :1.0000 Median :1.0 Median :0.0000
## Mean : 8.295 Mean :0.8299 Mean :0.6 Mean :0.7256
## 3rd Qu.:11.000 3rd Qu.:1.0000 3rd Qu.:1.0 3rd Qu.:0.0000
## Max. :11.000 Max. :1.0000 Max. :1.0 Max. :9.0000
## NA's :267 NA's :10 NA's :1028142
## ps_car_05_cat ps_car_06_cat ps_car_07_cat ps_car_08_cat
## Min. :0.0 Min. : 0.000 Min. :0.000 Min. :0.0000
## 1st Qu.:0.0 1st Qu.: 1.000 1st Qu.:1.000 1st Qu.:1.0000
## Median :1.0 Median : 7.000 Median :1.000 Median :1.0000
## Mean :0.5 Mean : 6.561 Mean :0.948 Mean :0.8322
## 3rd Qu.:1.0 3rd Qu.:11.000 3rd Qu.:1.000 3rd Qu.:1.0000
## Max. :1.0 Max. :17.000 Max. :1.000 Max. :1.0000
## NA's :666910 NA's :28820
## ps_car_09_cat ps_car_10_cat ps_car_11_cat ps_car_11
## Min. :0.000 Min. :0.0000 Min. : 1.00 Min. :0.000
## 1st Qu.:0.000 1st Qu.:1.0000 1st Qu.: 32.00 1st Qu.:2.000
## Median :2.000 Median :1.0000 Median : 65.00 Median :3.000
## Mean :1.331 Mean :0.9921 Mean : 62.26 Mean :2.346
## 3rd Qu.:2.000 3rd Qu.:1.0000 3rd Qu.: 94.00 3rd Qu.:3.000
## Max. :4.000 Max. :2.0000 Max. :104.00 Max. :3.000
## NA's :1446 NA's :6
## ps_car_12 ps_car_13 ps_car_14 ps_car_15
## Min. :0.1000 Min. :0.2506 Min. :0.11 Min. :0.000
## 1st Qu.:0.3162 1st Qu.:0.6710 1st Qu.:0.35 1st Qu.:2.828
## Median :0.3742 Median :0.7660 Median :0.37 Median :3.317
## Mean :0.3800 Mean :0.8135 Mean :0.37 Mean :3.067
## 3rd Qu.:0.4000 3rd Qu.:0.9061 3rd Qu.:0.40 3rd Qu.:3.606
## Max. :1.2649 Max. :4.0313 Max. :0.64 Max. :3.742
## NA's :1 NA's :106425
## ps_calc_01 ps_calc_02 ps_calc_03 ps_calc_04
## Min. :0.0000 Min. :0.0000 Min. :0.00 Min. :0.000
## 1st Qu.:0.2000 1st Qu.:0.2000 1st Qu.:0.20 1st Qu.:2.000
## Median :0.4000 Median :0.5000 Median :0.50 Median :2.000
## Mean :0.4497 Mean :0.4501 Mean :0.45 Mean :2.372
## 3rd Qu.:0.7000 3rd Qu.:0.7000 3rd Qu.:0.70 3rd Qu.:3.000
## Max. :0.9000 Max. :0.9000 Max. :0.90 Max. :5.000
##
## ps_calc_05 ps_calc_06 ps_calc_07 ps_calc_08
## Min. :0.000 Min. : 0.000 Min. :0.000 Min. : 1.000
## 1st Qu.:1.000 1st Qu.: 7.000 1st Qu.:2.000 1st Qu.: 8.000
## Median :2.000 Median : 8.000 Median :3.000 Median : 9.000
## Mean :1.886 Mean : 7.688 Mean :3.008 Mean : 9.226
## 3rd Qu.:3.000 3rd Qu.: 9.000 3rd Qu.:4.000 3rd Qu.:10.000
## Max. :6.000 Max. :10.000 Max. :9.000 Max. :12.000
##
## ps_calc_09 ps_calc_10 ps_calc_11 ps_calc_12
## Min. :0.000 Min. : 0.000 Min. : 0.00 Min. : 0.000
## 1st Qu.:1.000 1st Qu.: 6.000 1st Qu.: 4.00 1st Qu.: 1.000
## Median :2.000 Median : 8.000 Median : 5.00 Median : 1.000
## Mean :2.339 Mean : 8.439 Mean : 5.44 Mean : 1.441
## 3rd Qu.:3.000 3rd Qu.:10.000 3rd Qu.: 7.00 3rd Qu.: 2.000
## Max. :7.000 Max. :25.000 Max. :20.00 Max. :11.000
##
## ps_calc_13 ps_calc_14 ps_calc_15_bin ps_calc_16_bin
## Min. : 0.000 Min. : 0.00 Min. :0.0000 Min. :0.0000
## 1st Qu.: 2.000 1st Qu.: 6.00 1st Qu.:0.0000 1st Qu.:0.0000
## Median : 3.000 Median : 7.00 Median :0.0000 Median :1.0000
## Mean : 2.874 Mean : 7.54 Mean :0.1232 Mean :0.6278
## 3rd Qu.: 4.000 3rd Qu.: 9.00 3rd Qu.:0.0000 3rd Qu.:1.0000
## Max. :15.000 Max. :28.00 Max. :1.0000 Max. :1.0000
##
## ps_calc_17_bin ps_calc_18_bin ps_calc_19_bin ps_calc_20_bin
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :1.0000 Median :0.0000 Median :0.0000 Median :0.0000
## Mean :0.5545 Mean :0.2876 Mean :0.3492 Mean :0.1528
## 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:0.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
##
# Transposed preview of every column: name, type and leading values.
glimpse(dataset)
## Observations: 1,488,028
## Variables: 59
## $ id <int> 7, 9, 13, 16, 17, 19, 20, 22, 26, 28, 34, 35, 3...
## $ target <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,...
## $ ps_ind_01 <int> 2, 1, 5, 0, 0, 5, 2, 5, 5, 1, 5, 2, 2, 1, 5, 5,...
## $ ps_ind_02_cat <int> 2, 1, 4, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1,...
## $ ps_ind_03 <int> 5, 7, 9, 2, 0, 4, 3, 4, 3, 2, 2, 3, 1, 3, 11, 3...
## $ ps_ind_04_cat <int> 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1,...
## $ ps_ind_05_cat <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ ps_ind_06_bin <int> 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ ps_ind_07_bin <int> 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1,...
## $ ps_ind_08_bin <int> 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0,...
## $ ps_ind_09_bin <int> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,...
## $ ps_ind_10_bin <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ ps_ind_11_bin <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ ps_ind_12_bin <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ ps_ind_13_bin <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ ps_ind_14 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ ps_ind_15 <int> 11, 3, 12, 8, 9, 6, 8, 13, 6, 4, 3, 9, 10, 12, ...
## $ ps_ind_16_bin <int> 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0,...
## $ ps_ind_17_bin <int> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ ps_ind_18_bin <int> 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1,...
## $ ps_reg_01 <dbl> 0.7, 0.8, 0.0, 0.9, 0.7, 0.9, 0.6, 0.7, 0.9, 0....
## $ ps_reg_02 <dbl> 0.2, 0.4, 0.0, 0.2, 0.6, 1.8, 0.1, 0.4, 0.7, 1....
## $ ps_reg_03 <dbl> 0.7180703, 0.7660777, NA, 0.5809475, 0.8407586,...
## $ ps_car_01_cat <int> 10, 11, 7, 7, 11, 10, 6, 11, 10, 11, 11, 11, 6,...
## $ ps_car_02_cat <int> 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1,...
## $ ps_car_03_cat <int> NA, NA, NA, 0, NA, NA, NA, 0, NA, 0, NA, NA, NA...
## $ ps_car_04_cat <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 8, 0, 0, 0, 0, 9,...
## $ ps_car_05_cat <int> 1, NA, NA, 1, NA, 0, 1, 0, 1, 0, NA, NA, NA, 1,...
## $ ps_car_06_cat <int> 4, 11, 14, 11, 14, 14, 11, 11, 14, 14, 13, 11, ...
## $ ps_car_07_cat <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ ps_car_08_cat <int> 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0,...
## $ ps_car_09_cat <int> 0, 2, 2, 3, 2, 0, 0, 2, 0, 2, 2, 0, 2, 2, 2, 0,...
## $ ps_car_10_cat <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ ps_car_11_cat <int> 12, 19, 60, 104, 82, 104, 99, 30, 68, 104, 20, ...
## $ ps_car_11 <int> 2, 3, 1, 1, 3, 2, 2, 3, 3, 2, 3, 3, 3, 3, 1, 2,...
## $ ps_car_12 <dbl> 0.4000000, 0.3162278, 0.3162278, 0.3741657, 0.3...
## $ ps_car_13 <dbl> 0.8836789, 0.6188165, 0.6415857, 0.5429488, 0.5...
## $ ps_car_14 <dbl> 0.3708099, 0.3887158, 0.3472751, 0.2949576, 0.3...
## $ ps_car_15 <dbl> 3.605551, 2.449490, 3.316625, 2.000000, 2.00000...
## $ ps_calc_01 <dbl> 0.6, 0.3, 0.5, 0.6, 0.4, 0.7, 0.2, 0.1, 0.9, 0....
## $ ps_calc_02 <dbl> 0.5, 0.1, 0.7, 0.9, 0.6, 0.8, 0.6, 0.5, 0.8, 0....
## $ ps_calc_03 <dbl> 0.2, 0.3, 0.1, 0.1, 0.0, 0.4, 0.5, 0.1, 0.6, 0....
## $ ps_calc_04 <int> 3, 2, 2, 2, 2, 3, 2, 1, 3, 2, 2, 2, 4, 2, 3, 2,...
## $ ps_calc_05 <int> 1, 1, 2, 4, 2, 1, 2, 2, 1, 2, 3, 2, 1, 1, 1, 1,...
## $ ps_calc_06 <int> 10, 9, 9, 7, 6, 8, 8, 7, 7, 8, 8, 8, 8, 10, 8, ...
## $ ps_calc_07 <int> 1, 5, 1, 1, 3, 2, 1, 1, 3, 2, 2, 2, 4, 1, 2, 5,...
## $ ps_calc_08 <int> 10, 8, 8, 8, 10, 11, 8, 6, 9, 9, 9, 10, 11, 8, ...
## $ ps_calc_09 <int> 1, 1, 2, 4, 2, 3, 3, 1, 4, 1, 4, 1, 1, 3, 3, 2,...
## $ ps_calc_10 <int> 5, 7, 7, 2, 12, 8, 10, 13, 11, 11, 7, 8, 9, 8, ...
## $ ps_calc_11 <int> 9, 3, 4, 2, 3, 4, 3, 7, 4, 3, 6, 9, 6, 2, 4, 5,...
## $ ps_calc_12 <int> 1, 1, 2, 2, 1, 2, 0, 1, 2, 5, 3, 2, 3, 0, 1, 2,...
## $ ps_calc_13 <int> 5, 1, 7, 4, 1, 0, 0, 3, 1, 0, 3, 1, 3, 4, 3, 6,...
## $ ps_calc_14 <int> 8, 9, 7, 9, 3, 9, 10, 6, 5, 6, 6, 10, 8, 3, 9, ...
## $ ps_calc_15_bin <int> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ ps_calc_16_bin <int> 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1,...
## $ ps_calc_17_bin <int> 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1,...
## $ ps_calc_18_bin <int> 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,...
## $ ps_calc_19_bin <int> 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1,...
## $ ps_calc_20_bin <int> 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0,...
# Percentage (0-100) of NA entries in a vector.
pct_missing <- function(x) {
  sum(is.na(x)) / length(x) * 100
}
# Missing percentage per feature, skipping column 2 (target).
# vapply() visits the columns one by one, so the data frame is not coerced to
# a matrix as apply() would do, and the result type is checked. The helper
# keeps its own name instead of being overwritten by its result.
pMiss <- vapply(dataset[, -2], pct_missing, numeric(1))
# Keep only the features that have missing values, most affected first.
pMiss <- pMiss[pMiss > 0]
pMiss <- pMiss[order(pMiss, decreasing = TRUE)]
pMiss
## ps_car_03_cat ps_car_05_cat ps_reg_03 ps_car_14 ps_car_07_cat
## 6.909426e+01 4.481838e+01 1.810826e+01 7.152083e+00 1.936792e+00
## ps_ind_05_cat ps_car_09_cat ps_ind_02_cat ps_car_01_cat ps_ind_04_cat
## 9.757209e-01 9.717559e-02 3.514719e-02 1.794321e-02 1.532229e-02
## ps_car_02_cat ps_car_11 ps_car_12
## 6.720304e-04 4.032182e-04 6.720304e-05
# Percentage of missing values for each affected feature; las = 3 draws the
# axis labels vertically.
barplot(
  height = pMiss,
  main = "Features with Missing Values",
  las = 3,
  col = 4
)
# Value counts of column 2 (target), titled with the column's name.
barplot(
  height = table(dataset[2]),
  main = names(dataset)[2],
  col = c(4, 5)
)
# Value-count bar charts for the binary (*_bin) features, laid out on two
# 3x3 pages. Numbers are column positions in `dataset`.
bin_pages <- list(c(8:15, 18), c(19:20, 54:59))
for (page_cols in bin_pages) {
  par(mfrow = c(3, 3))
  for (col_idx in page_cols) {
    barplot(
      table(dataset[col_idx]),
      main = colnames(dataset[col_idx]),
      col = c(4, 5)
    )
  }
}
# Value-count bar charts for the categorical (*_cat) features: nine on a
# 3x3 page, then the remaining five on a 3x2 page. Numbers are column
# positions in `dataset`.
cat_pages <- list(
  list(layout = c(3, 3), cols = c(4, 6, 7, 24:29)),
  list(layout = c(3, 2), cols = 30:34)
)
for (page in cat_pages) {
  par(mfrow = page$layout)
  for (col_idx in page$cols) {
    barplot(
      table(dataset[col_idx]),
      main = colnames(dataset[col_idx]),
      col = 1:8
    )
  }
}
# Value-count bar charts for the integer-valued features that carry neither a
# _bin nor a _cat suffix, on two 3x3 pages (nine plots, then seven). Numbers
# are column positions in `dataset`.
int_pages <- list(c(3, 5, 16, 17, 35, 43:46), 47:53)
for (page_cols in int_pages) {
  par(mfrow = c(3, 3))
  for (col_idx in page_cols) {
    barplot(
      table(dataset[col_idx]),
      main = colnames(dataset[col_idx]),
      col = 1:8
    )
  }
}
# Distributions of the double-typed features (columns 21:23 and 36:42).

# Go back to a single panel first: otherwise the boxplot is drawn into one
# cell of whatever multi-panel layout an earlier par(mfrow = ...) left active.
par(mfrow = c(1, 1))
boxplot(dataset[, c(21:23, 36:42)], col = 3, las = 3)

# Histograms: six on a 3x2 page, then the remaining four on a 2x2 page.
hist_pages <- list(
  list(layout = c(3, 2), cols = c(21:23, 36:38)),
  list(layout = c(2, 2), cols = 39:42)
)
for (page in hist_pages) {
  par(mfrow = page$layout)
  for (col_idx in page$cols) {
    feature <- colnames(dataset)[col_idx]
    # Label the x axis with the feature name; by default hist() would print
    # the deparsed indexing expression there.
    hist(dataset[[col_idx]], main = feature, xlab = feature, col = 4)
  }
}
# Shaded mosaic plots of each binary (*_bin) feature against column 2
# (target). Every page requests a 2x2 grid; the last group has five plots, so
# its fifth plot starts a fresh page. Numbers are column positions in
# `dataset`.
bin_mosaic_pages <- list(8:11, 12:15, c(18:20, 54), 55:59)
for (page_cols in bin_mosaic_pages) {
  par(mfrow = c(2, 2))
  for (col_idx in page_cols) {
    mosaicplot(
      dataset[, col_idx] ~ dataset[, 2],
      main = "target/bin",
      ylab = colnames(dataset[2]),
      xlab = colnames(dataset[col_idx]),
      shade = TRUE,
      legend = TRUE
    )
  }
}
# Shaded mosaic plots of each categorical (*_cat) feature against column 2
# (target). Every page requests a 2x2 grid; the last group has six plots, so
# its fifth and sixth plots spill onto a fresh page. Numbers are column
# positions in `dataset`.
cat_mosaic_pages <- list(c(4, 6, 7, 24), 25:28, 29:34)
for (page_cols in cat_mosaic_pages) {
  par(mfrow = c(2, 2))
  for (col_idx in page_cols) {
    mosaicplot(
      dataset[, col_idx] ~ dataset[, 2],
      main = "target/cat",
      ylab = colnames(dataset[2]),
      xlab = colnames(dataset[col_idx]),
      shade = TRUE,
      legend = TRUE
    )
  }
}
# Shaded mosaic plots of each plain integer feature against column 2
# (target), on two 3x3 pages (nine plots, then seven). Numbers are column
# positions in `dataset`.
int_mosaic_pages <- list(c(3, 5, 16, 17, 35, 43:46), 47:53)
for (page_cols in int_mosaic_pages) {
  par(mfrow = c(3, 3))
  for (col_idx in page_cols) {
    mosaicplot(
      dataset[, col_idx] ~ dataset[, 2],
      main = "target/int",
      ylab = colnames(dataset[2]),
      xlab = colnames(dataset[col_idx]),
      shade = TRUE,
      legend = TRUE
    )
  }
}
# funModeling cross plots (plot_type = "quantity") of target for each
# double-typed feature, one call per feature on a two-column frame
# (feature + target).
# The input-variable argument is left unsupplied, as in the original calls
# that wrote an empty `str_input=`; presumably cross_plot() then plots every
# non-target column of the frame it is given -- confirm against the installed
# funModeling version.
# Recorded output: for ps_reg_03, ps_car_12, ps_car_13 and ps_car_14 the call
# printed
## [1] "Plotting transformed variable '<feature>' with 'equal_freq', (too many values). Disable with 'auto_binning=FALSE'"
cross_features <- c(
  "ps_reg_01", "ps_reg_02", "ps_reg_03",
  "ps_car_12", "ps_car_13", "ps_car_14", "ps_car_15",
  "ps_calc_01", "ps_calc_02", "ps_calc_03"
)
for (feature in cross_features) {
  cross_plot(
    dataset[, c(feature, "target")],
    str_target = "target",
    plot_type = "quantity"
  )
}
# Spearman rank correlations among the double-typed features (columns 21:23
# and 36:42), computed on the rows that are complete across all of them, and
# drawn as a lower-triangle correlogram without the diagonal.
num <- dataset[, c(21:23, 36:42)]
# Return to a single panel so the correlogram is not drawn into one cell of a
# multi-panel layout left active by an earlier par(mfrow = ...).
par(mfrow = c(1, 1))
num %>%
  cor(use = "complete.obs", method = "spearman") %>%
  corrplot(type = "lower", tl.col = "black", diag = FALSE)