Loading libraries

library(tidyverse)
library(corrplot)
library(caret)
library(polycor)
library(car)

Loading data

# Directory that holds train.csv and test.csv; change it here only.
data_dir <- "C:/Users/user/Desktop/Kaggle/Porto Seguro"
# Strings that read.csv() should turn into NA while parsing both files.
na_codes <- c("-1", "-1.0")
train <- read.csv(file.path(data_dir, "train.csv"), na.strings = na_codes)
test <- read.csv(file.path(data_dir, "test.csv"), na.strings = na_codes)
# Stack train on top of test so every variable is summarised over all rows.
dataset <- bind_rows(train, test)

Univariate analysis

Variables are split into categorical (categorical suffix, binary suffix, integer ordinal) and continuous (numerical) groups.

Outcome variable

# Distribution of the outcome, read from column 2 of the stacked data.
summary(dataset[,2]) 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##       0       0       0       0       0       1  892816
table(dataset[,2])
## 
##      0      1 
## 573518  21694
# Class shares among the non-missing outcomes. table() drops NA, so
# prop.table() divides by the number of labelled rows taken from the same
# object instead of borrowing the denominator from `train`.
prop.table(table(dataset[,2]))
## 
##          0          1 
## 0.96355248 0.03644752
# Number of rows with no outcome value.
miss_target <- sum(is.na(dataset[,2]))
miss_target # these are the values to predict in the test dataset
## [1] 892816

explanatory categorical variables

binary_suffix

summary(dataset[,8]) 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.3934  1.0000  1.0000
table(dataset[,8])
## 
##      0      1 
## 902572 585456
table(dataset[,8])/length(dataset[,8])
## 
##         0         1 
## 0.6065558 0.3934442
miss_ps_ind_06_bin <- sum(is.na(dataset[,8]) == TRUE)
miss_ps_ind_06_bin 
## [1] 0
summary(dataset[,9]) 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.2571  1.0000  1.0000
table(dataset[,9])
## 
##       0       1 
## 1105415  382613
table(dataset[,9])/length(dataset[,9])
## 
##         0         1 
## 0.7428724 0.2571276
miss_ps_ind_07_bin <- sum(is.na(dataset[,9]) == TRUE)
miss_ps_ind_07_bin 
## [1] 0
# Column 10 (its missing count is stored as miss_ps_ind_08_bin): five-number
# summary, level counts, level shares over all rows, and number of NAs.
# This sequence used to appear twice in a row with identical results; it is
# kept once.
summary(dataset[,10]) 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.1638  0.0000  1.0000
table(dataset[,10])
## 
##       0       1 
## 1244343  243685
table(dataset[,10])/length(dataset[,10])
## 
##         0         1 
## 0.8362363 0.1637637
miss_ps_ind_08_bin <- sum(is.na(dataset[,10]))
miss_ps_ind_08_bin 
## [1] 0
summary(dataset[,11]) 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.1857  0.0000  1.0000
table(dataset[,11])
## 
##       0       1 
## 1211754  276274
miss_ps_ind_09_bin <- sum(is.na(dataset[,11]) == TRUE)
miss_ps_ind_09_bin
## [1] 0
summary(dataset[,12]) 
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## 0.000000 0.000000 0.000000 0.000373 0.000000 1.000000
table(dataset[,12])
## 
##       0       1 
## 1487473     555
table(dataset[,12])/length(dataset[,12])
## 
##            0            1 
## 0.9996270231 0.0003729769
miss_ps_ind_10_bin <- sum(is.na(dataset[,12]) == TRUE)
miss_ps_ind_10_bin
## [1] 0
summary(dataset[,13]) 
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## 0.000000 0.000000 0.000000 0.001634 0.000000 1.000000
table(dataset[,13])
## 
##       0       1 
## 1485597    2431
table(dataset[,13])/length(dataset[,13])
## 
##           0           1 
## 0.998366294 0.001633706
miss_ps_ind_11_bin <- sum(is.na(dataset[,13]) == TRUE)
miss_ps_ind_11_bin 
## [1] 0
summary(dataset[,14]) 
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## 0.000000 0.000000 0.000000 0.009401 0.000000 1.000000
table(dataset[,14])
## 
##       0       1 
## 1474039   13989
table(dataset[,14])/length(dataset[,14])
## 
##           0           1 
## 0.990598967 0.009401033
miss_ps_ind_12_bin <- sum(is.na(dataset[,14]) == TRUE)
miss_ps_ind_12_bin
## [1] 0
summary(dataset[,15]) 
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## 0.000000 0.000000 0.000000 0.001003 0.000000 1.000000
table(dataset[,15])
## 
##       0       1 
## 1486536    1492
table(dataset[,15])/length(dataset[,15])
## 
##           0           1 
## 0.998997331 0.001002669
miss_ps_ind_13_bin <- sum(is.na(dataset[,15]) == TRUE)
miss_ps_ind_13_bin 
## [1] 0
summary(dataset[,18]) 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  1.0000  0.6607  1.0000  1.0000
table(dataset[,18])
## 
##      0      1 
## 504912 983116
table(dataset[,18])/length(dataset[,18])
## 
##         0         1 
## 0.3393162 0.6606838
miss_ps_ind_16_bin <- sum(is.na(dataset[,18]) == TRUE)
miss_ps_ind_16_bin 
## [1] 0
summary(dataset[,19]) 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.1207  0.0000  1.0000
table(dataset[,19])
## 
##       0       1 
## 1308465  179563
table(dataset[,19])/length(dataset[,19])
## 
##         0         1 
## 0.8793282 0.1206718
miss_ps_ind_17_bin <- sum(is.na(dataset[,19]) == TRUE)
miss_ps_ind_17_bin 
## [1] 0
summary(dataset[,20]) 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.1544  0.0000  1.0000
table(dataset[,20])
## 
##       0       1 
## 1258333  229695
table(dataset[,20])/length(dataset[,20])
## 
##        0        1 
## 0.845638 0.154362
miss_ps_ind_18_bin <- sum(is.na(dataset[,20]) == TRUE)
miss_ps_ind_18_bin 
## [1] 0
summary(dataset[,54]) 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.1232  0.0000  1.0000
table(dataset[,54])
## 
##       0       1 
## 1304699  183329
table(dataset[,54])/length(dataset[,54])
## 
##         0         1 
## 0.8767973 0.1232027
miss_ps_calc_15_bin <- sum(is.na(dataset[,54]) == TRUE)
miss_ps_calc_15_bin 
## [1] 0
summary(dataset[,56]) 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  1.0000  0.5545  1.0000  1.0000
table(dataset[,56])
## 
##      0      1 
## 662963 825065
table(dataset[,56])/length(dataset[,56])
## 
##         0         1 
## 0.4455313 0.5544687
miss_ps_calc_17_bin <- sum(is.na(dataset[,56]) == TRUE)
miss_ps_calc_17_bin 
## [1] 0
summary(dataset[,57]) 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.2876  1.0000  1.0000
table(dataset[,57])
## 
##       0       1 
## 1060145  427883
table(dataset[,57])/length(dataset[,57])
## 
##         0         1 
## 0.7124496 0.2875504
miss_ps_calc_18_bin <- sum(is.na(dataset[,57]) == TRUE)
miss_ps_calc_18_bin 
## [1] 0
summary(dataset[,58]) 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.3492  1.0000  1.0000
table(dataset[,58])
## 
##      0      1 
## 968385 519643
table(dataset[,58])/length(dataset[,58])
## 
##         0         1 
## 0.6507841 0.3492159
miss_ps_calc_19_bin <- sum(is.na(dataset[,58]) == TRUE)
miss_ps_calc_19_bin
## [1] 0
summary(dataset[,59]) 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.1528  0.0000  1.0000
table(dataset[,59])
## 
##       0       1 
## 1260681  227347
table(dataset[,59])/length(dataset[,59])
## 
##         0         1 
## 0.8472159 0.1527841
miss_ps_calc_20_bin <- sum(is.na(dataset[,59]) == TRUE)
miss_ps_calc_20_bin 
## [1] 0

cat_suffix

summary(dataset[,4]) 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    1.00    1.00    1.00    1.36    2.00    4.00     523
table(dataset[,4])
## 
##       1       2       3       4 
## 1079327  309747   70172   28259
table(dataset[,4])/length(dataset[,4])
## 
##          1          2          3          4 
## 0.72534052 0.20815939 0.04715771 0.01899091
# Number of missing values in column 4.
miss_ps_ind_02_cat <- sum(is.na(dataset[,4]))
miss_ps_ind_02_cat 
## [1] 523
# Missing share as a percentage of all rows, on the same scale as the other
# missing-rate lines in this analysis (523 / 1488028 * 100).
miss_ps_ind_02_cat/length(dataset[,4])*100
## [1] 0.03514719
summary(dataset[,6]) 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##  0.0000  0.0000  0.0000  0.4173  1.0000  1.0000     228
table(dataset[,6])
## 
##      0      1 
## 866864 620936
table(dataset[,6])/length(dataset[,6])
## 
##         0         1 
## 0.5825589 0.4172878
miss_ps_ind_04_cat <- sum(is.na(dataset[,6]) == TRUE)
miss_ps_ind_04_cat 
## [1] 228
miss_ps_ind_04_cat/length(dataset[,6])*100
## [1] 0.01532229
summary(dataset[,7]) 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   0.000   0.000   0.000   0.421   0.000   6.000   14519
table(dataset[,7])
## 
##       0       1       2       3       4       5       6 
## 1319412   20737   10707   20754   45706    4316   51877
table(dataset[,7])/length(dataset[,7])
## 
##           0           1           2           3           4           5 
## 0.886684928 0.013935894 0.007195429 0.013947318 0.030715820 0.002900483 
##           6 
## 0.034862919
miss_ps_ind_05_cat <- sum(is.na(dataset[,7]) == TRUE)
miss_ps_ind_05_cat 
## [1] 14519
miss_ps_ind_05_cat/length(dataset[,7])*100
## [1] 0.9757209
summary(dataset[,24]) 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   0.000   7.000   7.000   8.295  11.000  11.000     267
table(dataset[,24])
## 
##      0      1      2      3      4      5      6      7      8      9 
##  14844   3379   5242  16682  65720  45082 155779 449617  37603  50501 
##     10     11 
## 124587 518725
table(dataset[,24])/length(dataset[,24])
## 
##           0           1           2           3           4           5 
## 0.009975619 0.002270791 0.003522783 0.011210811 0.044165836 0.030296473 
##           6           7           8           9          10          11 
## 0.104688218 0.302156277 0.025270358 0.033938205 0.083726247 0.348598951
miss_ps_car_01_cat <- sum(is.na(dataset[,24]) == TRUE)
miss_ps_car_01_cat 
## [1] 267
miss_ps_car_01_cat/length(dataset[,24])*100
## [1] 0.01794321
summary(dataset[,25]) 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##  0.0000  1.0000  1.0000  0.8299  1.0000  1.0000      10
table(dataset[,25])
## 
##       0       1 
##  253039 1234979
table(dataset[,25])/length(dataset[,25])
## 
##         0         1 
## 0.1700499 0.8299434
miss_ps_car_02_cat <- sum(is.na(dataset[,25]) == TRUE)
miss_ps_car_02_cat 
## [1] 10
miss_ps_car_02_cat/length(dataset[,25])*100
## [1] 0.0006720304
summary(dataset[,26]) 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##     0.0     0.0     1.0     0.6     1.0     1.0 1028142
table(dataset[,26])
## 
##      0      1 
## 183044 276842
table(dataset[,26])/length(dataset[,26])
## 
##         0         1 
## 0.1230111 0.1860462
miss_ps_car_03_cat <- sum(is.na(dataset[,26]) == TRUE)
miss_ps_car_03_cat 
## [1] 1028142
miss_ps_car_03_cat/length(dataset[,26])*100
## [1] 69.09426
summary(dataset[,27]) 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.7256  0.0000  9.0000
table(dataset[,27])
## 
##       0       1       2       3       4       5       6       7       8 
## 1241334   80561   59088    1713     627    1330    3937     370   51211 
##       9 
##   47857
table(dataset[,27])/length(dataset[,27])
## 
##            0            1            2            3            4 
## 0.8342141411 0.0541394382 0.0397089302 0.0011511880 0.0004213630 
##            5            6            7            8            9 
## 0.0008938004 0.0026457835 0.0002486512 0.0344153470 0.0321613572
miss_ps_car_04_cat <- sum(is.na(dataset[,27]) == TRUE)
miss_ps_car_04_cat 
## [1] 0
summary(dataset[,28]) 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##     0.0     0.0     1.0     0.5     1.0     1.0  666910
table(dataset[,28])
## 
##      0      1 
## 389558 431560
table(dataset[,28])/length(dataset[,28])
## 
##         0         1 
## 0.2617948 0.2900214
miss_ps_car_05_cat <- sum(is.na(dataset[,28]) == TRUE)
miss_ps_car_05_cat 
## [1] 666910
miss_ps_car_05_cat/length(dataset[,28])*100
## [1] 44.81838
summary(dataset[,29]) 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   1.000   7.000   6.561  11.000  17.000
table(dataset[,29])
## 
##      0      1      2      3      4      5      6      7      8      9 
## 275497 295574   4123  30031  77845   3479  52571  40643   3471  43810 
##     10     11     12     13     14     15     16     17 
##  83563 329890   5991  15356 147714  54151  11771  12548
table(dataset[,29])/length(dataset[,29])
## 
##           0           1           2           3           4           5 
## 0.185142349 0.198634703 0.002770781 0.020181744 0.052314204 0.002337994 
##           6           7           8           9          10          11 
## 0.035329308 0.027313330 0.002332617 0.029441650 0.056156873 0.221696097 
##          12          13          14          15          16          17 
## 0.004026134 0.010319698 0.099268293 0.036391116 0.007910469 0.008432637
miss_ps_car_06_cat <- sum(is.na(dataset[,29]) == TRUE)
miss_ps_car_06_cat 
## [1] 0
summary(dataset[,30]) 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   0.000   1.000   1.000   0.948   1.000   1.000   28820
table(dataset[,30])
## 
##       0       1 
##   76138 1383070
table(dataset[,30])/length(dataset[,30])
## 
##          0          1 
## 0.05116705 0.92946504
miss_ps_car_07_cat <- sum(is.na(dataset[,30]) == TRUE)
miss_ps_car_07_cat 
## [1] 28820
miss_ps_car_07_cat/length(dataset[,30])*100
## [1] 1.936792
summary(dataset[,31]) 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  1.0000  1.0000  0.8322  1.0000  1.0000
table(dataset[,31])
## 
##       0       1 
##  249663 1238365
table(dataset[,31])/length(dataset[,31])
## 
##         0         1 
## 0.1677811 0.8322189
miss_ps_car_08_cat <- sum(is.na(dataset[,31]) == TRUE)
miss_ps_car_08_cat 
## [1] 0
summary(dataset[,32]) 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   0.000   0.000   2.000   1.331   2.000   4.000    1446
table(dataset[,32])
## 
##      0      1      2      3      4 
## 486510  72947 883326  36798   7001
table(dataset[,32])/length(dataset[,32])
## 
##           0           1           2           3           4 
## 0.326949493 0.049022599 0.593621894 0.024729373 0.004704885
miss_ps_car_09_cat <- sum(is.na(dataset[,32]) == TRUE)
miss_ps_car_09_cat 
## [1] 1446
miss_ps_car_09_cat/length(dataset[,32])*100
## [1] 0.09717559
summary(dataset[,33]) 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  1.0000  1.0000  0.9921  1.0000  2.0000
table(dataset[,33])
## 
##       0       1       2 
##   12136 1475460     432
table(dataset[,33])/length(dataset[,33])
## 
##            0            1            2 
## 0.0081557605 0.9915539224 0.0002903171
miss_ps_car_10_cat <- sum(is.na(dataset[,33]) == TRUE)
miss_ps_car_10_cat 
## [1] 0
summary(dataset[,34]) 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00   32.00   65.00   62.26   94.00  104.00
table(dataset[,34])
## 
##      1      2      3      4      5      6      7      8      9     10 
##   8228   6379   7992   3894  31198   6071  14249   6158   5457  21742 
##     11     12     13     14     15     16     17     18     19     20 
##  16667  18326   8371   6727   4645  17716   9725   3066  12535   4530 
##     21     22     23     24     25     26     27     28     29     30 
##   6070  20926   4072   5857  11658   8724  14912  31344  10957   5891 
##     31     32     33     34     35     36     37     38     39     40 
##  13143  31175   4611  11774   3821   7372  12623  22952  19211  12391 
##     41     42     43     44     45     46     47     48     49     50 
##   7963  12336   8261  14231   4316  13351   5044  15198  13672   4475 
##     51     52     53     54     55     56     57     58     59     60 
##  14987   8811  11658   4022  10565   3175   9382   3767   4479  19943 
##     61     62     63     64     65     66     67     68     69     70 
##   6886   7082   2722  55391  27800   7877  17806  20733   4553  18927 
##     71     72     73     74     75     76     77     78     79     80 
##   5933   7215   4614  12098   4343   6543   6142  18327   5190  14053 
##     81     82     83     84     85     86     87     88     89     90 
##   3608  26161  23506  12276  15838   9282  42872  11213  12730  13030 
##     91     92     93     94     95     96     97     98     99    100 
##   4553  16344   7220   8364   8972   5116   3692   7118  30303  11200 
##    101    102    103    104 
##  18416   5132  61062 212989
table(dataset[,34])/length(dataset[,34])
## 
##           1           2           3           4           5           6 
## 0.005529466 0.004286882 0.005370867 0.002616886 0.020966003 0.004079896 
##           7           8           9          10          11          12 
## 0.009575761 0.004138363 0.003667270 0.014611284 0.011200730 0.012315628 
##          13          14          15          16          17          18 
## 0.005625566 0.004520748 0.003121581 0.011905690 0.006535495 0.002060445 
##          19          20          21          22          23          24 
## 0.008423901 0.003044298 0.004079224 0.014062907 0.002736508 0.003936082 
##          25          26          27          28          29          30 
## 0.007834530 0.005862793 0.010021317 0.021064120 0.007363437 0.003958931 
##          31          32          33          34          35          36 
## 0.008832495 0.020950547 0.003098732 0.007912486 0.002567828 0.004954208 
##          37          38          39          40          41          42 
## 0.008483039 0.015424441 0.012910375 0.008327128 0.005351378 0.008290167 
##          43          44          45          46          47          48 
## 0.005551643 0.009563664 0.002900483 0.008972277 0.003389721 0.010213517 
##          49          50          51          52          53          54 
## 0.009187999 0.003007336 0.010071719 0.005921260 0.007834530 0.002702906 
##          55          56          57          58          59          60 
## 0.007100001 0.002133696 0.006304989 0.002531538 0.003010024 0.013402302 
##          61          62          63          64          65          66 
## 0.004627601 0.004759319 0.001829267 0.037224434 0.018682444 0.005293583 
##          67          68          69          70          71          72 
## 0.011966173 0.013933206 0.003059754 0.012719519 0.003987156 0.004848699 
##          73          74          75          76          77          78 
## 0.003100748 0.008130223 0.002918628 0.004397095 0.004127611 0.012316300 
##          79          80          81          82          83          84 
## 0.003487838 0.009444043 0.002424686 0.017580986 0.015796746 0.008249845 
##          85          86          87          88          89          90 
## 0.010643617 0.006237786 0.028811286 0.007535476 0.008554947 0.008756556 
##          91          92          93          94          95          96 
## 0.003059754 0.010983664 0.004852059 0.005620862 0.006029456 0.003438107 
##          97          98          99         100         101         102 
## 0.002481136 0.004783512 0.020364536 0.007526740 0.012376111 0.003448860 
##         103         104 
## 0.041035518 0.143135075
miss_ps_car_11_cat <- sum(is.na(dataset[,34]) == TRUE)
miss_ps_car_11_cat
## [1] 0

integer_type

summary(dataset[,3]) 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   1.000   1.902   3.000   7.000
table(dataset[,3])
## 
##      0      1      2      3      4      5      6      7 
## 469109 359925 205761 127634  84045 153663  44486  43405
table(dataset[,3])/length(dataset[,3])
## 
##          0          1          2          3          4          5 
## 0.31525549 0.24188053 0.13827764 0.08577392 0.05648079 0.10326620 
##          6          7 
## 0.02989594 0.02916948
miss_ps_ind_01 <- sum(is.na(dataset[,3]) == TRUE)
miss_ps_ind_01 
## [1] 0
summary(dataset[,5]) 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   2.000   4.000   4.418   6.000  11.000
table(dataset[,5])
## 
##      0      1      2      3      4      5      6      7      8      9 
##  38878 170922 240652 204836 167980 157043 150987 130452  98528  63551 
##     10     11 
##  35863  28336
table(dataset[,5])/length(dataset[,5])
## 
##          0          1          2          3          4          5 
## 0.02612720 0.11486477 0.16172545 0.13765601 0.11288766 0.10553766 
##          6          7          8          9         10         11 
## 0.10146785 0.08766771 0.06621381 0.04270820 0.02410102 0.01904265
miss_ps_ind_03 <- sum(is.na(dataset[,5]) == TRUE)
miss_ps_ind_03
## [1] 0
summary(dataset[,16]) 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00000 0.00000 0.00000 0.01241 0.00000 4.00000
table(dataset[,16])
## 
##       0       1       2       3       4 
## 1472125   13707    1847     330      19
table(dataset[,16])/length(dataset[,16])
## 
##            0            1            2            3            4 
## 9.893127e-01 9.211520e-03 1.241240e-03 2.217700e-04 1.276858e-05
miss_ps_ind_14 <- sum(is.na(dataset[,16]) == TRUE)
miss_ps_ind_14
## [1] 0
summary(dataset[,17]) 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   5.000   7.000   7.298  10.000  13.000
table(dataset[,17])
## 
##      0      1      2      3      4      5      6      7      8      9 
##  78880  29476  46951  81057 104192 106079 145977 163865 149445 113902 
##     10     11     12     13 
## 135354 133059 112023  87768
table(dataset[,17])/length(dataset[,17])
## 
##          0          1          2          3          4          5 
## 0.05300976 0.01980877 0.03155250 0.05447277 0.07002019 0.07128831 
##          6          7          8          9         10         11 
## 0.09810098 0.11012226 0.10043158 0.07654560 0.09096200 0.08941969 
##         12         13 
## 0.07528286 0.05898276
miss_ps_ind_15 <- sum(is.na(dataset[,17]) == TRUE)
miss_ps_ind_15
## [1] 0
summary(dataset[,35]) 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   0.000   2.000   3.000   2.346   3.000   3.000       6
table(dataset[,35])
## 
##      0      1      2      3 
##  64634 152661 473339 797388
table(dataset[,35])/length(dataset[,35])
## 
##          0          1          2          3 
## 0.04343601 0.10259283 0.31809818 0.53586895
miss_ps_car_11 <- sum(is.na(dataset[,35]) == TRUE)
miss_ps_car_11
## [1] 6
miss_ps_car_11/length(dataset[,35])*100
## [1] 0.0004032182
summary(dataset[,43]) 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   2.000   2.000   2.372   3.000   5.000
table(dataset[,43])
## 
##      0      1      2      3      4      5 
##  59597 270227 485574 438576 198267  35787
table(dataset[,43])/length(dataset[,43])
## 
##          0          1          2          3          4          5 
## 0.04005099 0.18160075 0.32632047 0.29473639 0.13324144 0.02404995
miss_ps_calc_04 <- sum(is.na(dataset[,43]) == TRUE)
miss_ps_calc_04
## [1] 0
summary(dataset[,44]) 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   1.000   2.000   1.886   3.000   6.000
table(dataset[,44])
## 
##      0      1      2      3      4      5      6 
## 153919 426455 487503 297928 102032  18766   1425
table(dataset[,44])/length(dataset[,44])
## 
##            0            1            2            3            4 
## 0.1034382418 0.2865907093 0.3276168190 0.2002166626 0.0685686022 
##            5            6 
## 0.0126113218 0.0009576433
miss_ps_calc_05 <- sum(is.na(dataset[,44]) == TRUE)
miss_ps_calc_05
## [1] 0
summary(dataset[,45]) 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   7.000   8.000   7.688   9.000  10.000
table(dataset[,45])
## 
##      0      1      2      3      4      5      6      7      8      9 
##      1     26    317   2913  16706  66566 184040 350227 437098 322361 
##     10 
## 107773
table(dataset[,45])/length(dataset[,45])
## 
##            0            1            2            3            4 
## 6.720304e-07 1.747279e-05 2.130336e-04 1.957624e-03 1.122694e-02 
##            5            6            7            8            9 
## 4.473437e-02 1.236805e-01 2.353632e-01 2.937431e-01 2.166364e-01 
##           10 
## 7.242673e-02
miss_ps_calc_06 <- sum(is.na(dataset[,45]) == TRUE)
miss_ps_calc_06
## [1] 0
summary(dataset[,46]) 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   2.000   3.000   3.008   4.000   9.000
table(dataset[,46])
## 
##      0      1      2      3      4      5      6      7      8      9 
##  38224 172836 346628 406921 305796 153767  51322  11082   1383     69
table(dataset[,46])/length(dataset[,46])
## 
##            0            1            2            3            4 
## 0.0256876887 0.1161510402 0.2329445414 0.2734632682 0.2055041975 
##            5            6            7            8            9 
## 0.1033360931 0.0344899424 0.0074474405 0.0009294180 0.0000463701
miss_ps_calc_07 <- sum(is.na(dataset[,46]) == TRUE)
miss_ps_calc_07
## [1] 0
summary(dataset[,47]) 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   8.000   9.000   9.226  10.000  12.000
table(dataset[,47])
## 
##      1      2      3      4      5      6      7      8      9     10 
##      2     29    265   2111  11180  43098 123797 257182 378846 379246 
##     11     12 
## 228801  63471
table(dataset[,47])/length(dataset[,47])
## 
##            1            2            3            4            5 
## 1.344061e-06 1.948888e-05 1.780880e-04 1.418656e-03 7.513299e-03 
##            6            7            8            9           10 
## 2.896316e-02 8.319534e-02 1.728341e-01 2.545960e-01 2.548648e-01 
##           11           12 
## 1.537612e-01 4.265444e-02
miss_ps_calc_08 <- sum(is.na(dataset[,47]) == TRUE)
miss_ps_calc_08
## [1] 0
summary(dataset[,48]) 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   1.000   2.000   2.339   3.000   7.000
table(dataset[,48])
## 
##      0      1      2      3      4      5      6      7 
##  86476 303295 456770 381570 191957  57710   9568    682
table(dataset[,48])/length(dataset[,48])
## 
##            0            1            2            3            4 
## 0.0581144978 0.2038234496 0.3069633098 0.2564266264 0.1290009328 
##            5            6            7 
## 0.0387828724 0.0064299865 0.0004583247
miss_ps_calc_09 <- sum(is.na(dataset[,48]) == TRUE)
miss_ps_calc_09
## [1] 0
summary(dataset[,49]) 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   6.000   8.000   8.439  10.000  25.000
table(dataset[,49])
## 
##      0      1      2      3      4      5      6      7      8      9 
##    309   2652  11471  32179  67995 115159 161555 194499 205217 192144 
##     10     11     12     13     14     15     16     17     18     19 
## 163008 124002  87453  57088  34429  19481  10180   5135   2348   1011 
##     20     21     22     23     24     25 
##    440    182     68     16      4      3
table(dataset[,49])/length(dataset[,49])
## 
##            0            1            2            3            4 
## 2.076574e-04 1.782225e-03 7.708860e-03 2.162527e-02 4.569470e-02 
##            5            6            7            8            9 
## 7.739034e-02 1.085699e-01 1.307092e-01 1.379121e-01 1.291266e-01 
##           10           11           12           13           14 
## 1.095463e-01 8.333311e-02 5.877107e-02 3.836487e-02 2.313733e-02 
##           15           16           17           18           19 
## 1.309182e-02 6.841269e-03 3.450876e-03 1.577927e-03 6.794227e-04 
##           20           21           22           23           24 
## 2.956934e-04 1.223095e-04 4.569806e-05 1.075249e-05 2.688121e-06 
##           25 
## 2.016091e-06
miss_ps_calc_10 <- sum(is.na(dataset[,49]) == TRUE)
miss_ps_calc_10
## [1] 0
summary(dataset[,50]) 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    4.00    5.00    5.44    7.00   20.00
table(dataset[,50])
## 
##      0      1      2      3      4      5      6      7      8      9 
##   6329  35157  95733 173074 235407 256571 232989 180725 122227  74547 
##     10     11     12     13     14     15     16     17     18     19 
##  40262  19962   8978   3801   1471    524    190     51     23      6 
##     20 
##      1
table(dataset[,50])/length(dataset[,50])
## 
##            0            1            2            3            4 
## 4.253280e-03 2.362657e-02 6.433548e-02 1.163110e-01 1.582007e-01 
##            5            6            7            8            9 
## 1.724235e-01 1.565757e-01 1.214527e-01 8.214026e-02 5.009785e-02 
##           10           11           12           13           14 
## 2.705729e-02 1.341507e-02 6.033489e-03 2.554387e-03 9.885567e-04 
##           15           16           17           18           19 
## 3.521439e-04 1.276858e-04 3.427355e-05 1.545670e-05 4.032182e-06 
##           20 
## 6.720304e-07
miss_ps_calc_11 <- sum(is.na(dataset[,50]) == TRUE)
miss_ps_calc_11
## [1] 0
summary(dataset[,51]) 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   1.000   1.000   1.441   2.000  11.000
table(dataset[,51])
## 
##      0      1      2      3      4      5      6      7      8      9 
## 352711 507536 364715 175675  63648  18241   4383    937    149     29 
##     10     11 
##      3      1
table(dataset[,51])/length(dataset[,51])
## 
##            0            1            2            3            4 
## 2.370325e-01 3.410796e-01 2.450996e-01 1.180589e-01 4.277339e-02 
##            5            6            7            8            9 
## 1.225851e-02 2.945509e-03 6.296925e-04 1.001325e-04 1.948888e-05 
##           10           11 
## 2.016091e-06 6.720304e-07
miss_ps_calc_12 <- sum(is.na(dataset[,51]) == TRUE)
miss_ps_calc_12
## [1] 0
summary(dataset[,52]) 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   2.000   3.000   2.874   4.000  15.000
table(dataset[,52])
## 
##      0      1      2      3      4      5      6      7      8      9 
##  84026 241100 347605 332214 239373 136916  65687  27229   9685   3032 
##     10     11     12     13     14     15 
##    889    209     51      9      2      1
table(dataset[,52])/length(dataset[,52])
## 
##            0            1            2            3            4 
## 5.646802e-02 1.620265e-01 2.336011e-01 2.232579e-01 1.608659e-01 
##            5            6            7            8            9 
## 9.201171e-02 4.414366e-02 1.829871e-02 6.508614e-03 2.037596e-03 
##           10           11           12           13           14 
## 5.974350e-04 1.404543e-04 3.427355e-05 6.048273e-06 1.344061e-06 
##           15 
## 6.720304e-07
miss_ps_calc_13 <- sum(is.na(dataset[,52]) == TRUE)
miss_ps_calc_13
## [1] 0
summary(dataset[,53]) 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    6.00    7.00    7.54    9.00   28.00
table(dataset[,53])
## 
##      0      1      2      3      4      5      6      7      8      9 
##    776   5875  22460  56722 106204 160704 201962 217553 204937 171448 
##     10     11     12     13     14     15     16     17     18     19 
## 129415  88435  55895  32216  17461   8674   4086   1913    778    326 
##     20     21     22     23     28 
##    114     49     20      4      1
table(dataset[,53])/length(dataset[,53])
## 
##            0            1            2            3            4 
## 5.214956e-04 3.948178e-03 1.509380e-02 3.811891e-02 7.137231e-02 
##            5            6            7            8            9 
## 1.079980e-01 1.357246e-01 1.462022e-01 1.377239e-01 1.152183e-01 
##           10           11           12           13           14 
## 8.697081e-02 5.943101e-02 3.756314e-02 2.165013e-02 1.173432e-02 
##           15           16           17           18           19 
## 5.829191e-03 2.745916e-03 1.285594e-03 5.228396e-04 2.190819e-04 
##           20           21           22           23           28 
## 7.661146e-05 3.292949e-05 1.344061e-05 2.688121e-06 6.720304e-07
miss_ps_calc_14 <- sum(is.na(dataset[,53]) == TRUE)
miss_ps_calc_14
## [1] 0

explanatory numerical variables

# Column 21: five-number summary, missing count, spread, shape and outliers.
summary(dataset[[21]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.400   0.700   0.611   0.900   0.900
miss_ps_reg_01 <- sum(is.na(dataset[[21]]))
miss_ps_reg_01
## [1] 0
sd(dataset[[21]])
## [1] 0.2876763
var(dataset[[21]])
## [1] 0.08275764
# Skewness: mean cubed deviation divided by the cubed sample sd.
skew_ps_reg_01 <- local({
  v <- dataset[[21]]
  mean((v - mean(v))^3 / sd(v)^3)
})
skew_ps_reg_01
## [1] -0.6404163
# Excess kurtosis: mean fourth-power deviation over sd^4, minus 3.
kurtosys_ps_reg_01 <- local({
  v <- dataset[[21]]
  mean((v - mean(v))^4 / sd(v)^4) - 3
})
kurtosys_ps_reg_01
## [1] -0.8847695
boxplot.stats(dataset[[21]])$out
## numeric(0)
# Column 22 (ps_reg_02): summary, missing count, spread, shape and outliers.
summary(dataset[[22]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.2000  0.3000  0.4396  0.6000  1.8000
miss_ps_reg_02 <- sum(is.na(dataset[[22]]))
miss_ps_reg_02
## [1] 0
sd(dataset[[22]])
## [1] 0.4045123
var(dataset[[22]])
## [1] 0.1636302
# Skewness: mean cubed deviation divided by the cubed sample sd.
skew_ps_reg_02 <- local({
  v <- dataset[[22]]
  mean((v - mean(v))^3 / sd(v)^3)
})
skew_ps_reg_02
## [1] 1.280006
# Excess kurtosis: mean fourth-power deviation over sd^4, minus 3.
kurtosys_ps_reg_02 <- local({
  v <- dataset[[22]]
  mean((v - mean(v))^4 / sd(v)^4) - 3
})
kurtosys_ps_reg_02
## [1] 1.118253
# Largest, smallest and number of boxplot outliers.
max(boxplot.stats(dataset[[22]])$out)
## [1] 1.8
min(boxplot.stats(dataset[[22]])$out)
## [1] 1.3
length(boxplot.stats(dataset[[22]])$out)
## [1] 92219
# Tukey fences: quartiles -/+ 1.5 * IQR.
x <- dataset[[22]]
qnt <- quantile(x, probs = c(0.25, 0.75), na.rm = TRUE)
qnt
## 25% 75% 
## 0.2 0.6
caps <- quantile(x, probs = c(0.05, 0.95), na.rm = TRUE)
caps
##  5% 95% 
## 0.0 1.3
H <- 1.5 * IQR(x, na.rm = TRUE)
H
## [1] 0.6
out1 <- x[x < qnt[1] - H]  # below the lower fence
out2 <- x[x > qnt[2] + H]  # above the upper fence
sum(!is.na(out1))
## [1] 0
sum(!is.na(out2))
## [1] 92219
sum(!is.na(out2)) / length(x)
## [1] 0.06197397
# Column 23 (ps_reg_03): summary, missing share, spread, shape and outliers.
summary(dataset[[23]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    0.06    0.63    0.80    0.89    1.09    4.42  269456
miss_ps_reg_03 <- sum(is.na(dataset[[23]]))
miss_ps_reg_03
## [1] 269456
miss_ps_reg_03 / length(dataset[[23]])
## [1] 0.1810826
# The column has missing values, so they are removed before computing the
# moments; without that, sd(), var(), the skewness and the kurtosis are all NA.
# The printed values are not shown here; re-run the chunk to obtain them.
sd(dataset[[23]], na.rm = TRUE)
var(dataset[[23]], na.rm = TRUE)
# Skewness of the observed values.
skew_ps_reg_03 <- local({
  v <- dataset[[23]]
  v <- v[!is.na(v)]
  mean((v - mean(v))^3 / sd(v)^3)
})
skew_ps_reg_03
# Excess kurtosis of the observed values.
kurtosys_ps_reg_03 <- local({
  v <- dataset[[23]]
  v <- v[!is.na(v)]
  mean((v - mean(v))^4 / sd(v)^4) - 3
})
kurtosys_ps_reg_03
# Largest, smallest and number of boxplot outliers.
max(boxplot.stats(dataset[[23]])$out)
## [1] 4.423517
min(boxplot.stats(dataset[[23]])$out)
## [1] 1.763874
sum(table(boxplot.stats(dataset[[23]])$out))
## [1] 26192
# Tukey fences: quartiles -/+ 1.5 * IQR.
x <- dataset[[23]]
qnt <- quantile(x, probs = c(0.25, 0.75), na.rm = TRUE)
qnt
##       25%       75% 
## 0.6339361 1.0854147
caps <- quantile(x, probs = c(0.05, 0.95), na.rm = TRUE)
caps
##        5%       95% 
## 0.4911721 1.5858752
H <- 1.5 * IQR(x, na.rm = TRUE)
H
## [1] 0.6772178
# Comparisons with NA give NA entries, hence the !is.na() when counting.
out1 <- dataset$ps_reg_03[dataset$ps_reg_03 < (qnt[1] - H)]
out2 <- dataset$ps_reg_03[dataset$ps_reg_03 > (qnt[2] + H)]
sum(!is.na(out1))
## [1] 0
sum(!is.na(out2))
## [1] 26192
sum(!is.na(out2)) / length(dataset$ps_reg_03)
## [1] 0.01760182
# Column 36 (ps_car_12): summary, missing share, spread, shape and outliers.
summary(dataset[[36]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##  0.1000  0.3162  0.3742  0.3800  0.4000  1.2649       1
miss_ps_car_12 <- sum(is.na(dataset[[36]]))
miss_ps_car_12
## [1] 1
miss_ps_car_12 / length(dataset[[36]])
## [1] 6.720304e-07
# A single missing value is enough to turn sd(), var(), the skewness and the
# kurtosis into NA, so missing values are removed first. The printed values
# are not shown here; re-run the chunk to obtain them.
sd(dataset[[36]], na.rm = TRUE)
var(dataset[[36]], na.rm = TRUE)
# Skewness of the observed values.
skew_ps_car_12 <- local({
  v <- dataset[[36]]
  v <- v[!is.na(v)]
  mean((v - mean(v))^3 / sd(v)^3)
})
skew_ps_car_12
# Excess kurtosis of the observed values.
kurtosys_ps_car_12 <- local({
  v <- dataset[[36]]
  v <- v[!is.na(v)]
  mean((v - mean(v))^4 / sd(v)^4) - 3
})
kurtosys_ps_car_12
# Largest, smallest and number of boxplot outliers.
max(boxplot.stats(dataset[[36]])$out)
## [1] 1.264911
min(boxplot.stats(dataset[[36]])$out)
## [1] 0.1
sum(table(boxplot.stats(dataset[[36]])$out))
## [1] 38618
# Tukey fences: quartiles -/+ 1.5 * IQR.
x <- dataset[[36]]
qnt <- quantile(x, probs = c(0.25, 0.75), na.rm = TRUE)
qnt
##       25%       75% 
## 0.3162278 0.4000000
caps <- quantile(x, probs = c(0.05, 0.95), na.rm = TRUE)
caps
##        5%       95% 
## 0.3160696 0.4688283
H <- 1.5 * IQR(x, na.rm = TRUE)
H
## [1] 0.1256584
# Comparisons with NA give NA entries, hence the !is.na() when counting.
out1 <- dataset$ps_car_12[dataset$ps_car_12 < (qnt[1] - H)]
out2 <- dataset$ps_car_12[dataset$ps_car_12 > (qnt[2] + H)]
sum(!is.na(out1))
## [1] 140
sum(!is.na(out1)) / length(dataset$ps_car_12)
## [1] 9.408425e-05
sum(!is.na(out2))
## [1] 38478
sum(!is.na(out2)) / length(dataset$ps_car_12)
## [1] 0.02585838
# Column 37 (ps_car_13): summary, missing count, spread, shape and outliers.
summary(dataset[[37]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.2506  0.6710  0.7660  0.8135  0.9061  4.0313
miss_ps_car_13 <- sum(is.na(dataset[[37]]))
miss_ps_car_13
## [1] 0
sd(dataset[[37]])
## [1] 0.2247024
var(dataset[[37]])
## [1] 0.05049117
# Skewness: mean cubed deviation divided by the cubed sample sd.
skew_ps_car_13 <- local({
  v <- dataset[[37]]
  mean((v - mean(v))^3 / sd(v)^3)
})
skew_ps_car_13
## [1] 1.697218
# Excess kurtosis: mean fourth-power deviation over sd^4, minus 3.
kurtosys_ps_car_13 <- local({
  v <- dataset[[37]]
  mean((v - mean(v))^4 / sd(v)^4) - 3
})
kurtosys_ps_car_13
## [1] 5.392546
# Largest, smallest and number of boxplot outliers.
max(boxplot.stats(dataset[[37]])$out)
## [1] 4.031301
min(boxplot.stats(dataset[[37]])$out)
## [1] 0.2506191
length(boxplot.stats(dataset[[37]])$out)
## [1] 67765
# Tukey fences: quartiles -/+ 1.5 * IQR.
x <- dataset[[37]]
qnt <- quantile(x, probs = c(0.25, 0.75), na.rm = TRUE)
qnt
##       25%       75% 
## 0.6710052 0.9061429
caps <- quantile(x, probs = c(0.05, 0.95), na.rm = TRUE)
caps
##        5%       95% 
## 0.5426823 1.2368899
H <- 1.5 * IQR(x, na.rm = TRUE)
H
## [1] 0.3527067
out1 <- x[x < qnt[1] - H]  # below the lower fence
out2 <- x[x > qnt[2] + H]  # above the upper fence
sum(!is.na(out1))
## [1] 36
sum(!is.na(out1)) / length(x)
## [1] 2.419309e-05
sum(!is.na(out2))
## [1] 67729
sum(!is.na(out2)) / length(x)
## [1] 0.04551594
# Column 38 (ps_car_14): summary, missing share, spread, shape and outliers.
summary(dataset[[38]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    0.11    0.35    0.37    0.37    0.40    0.64  106425
miss_ps_car_14 <- sum(is.na(dataset[[38]]))
miss_ps_car_14
## [1] 106425
miss_ps_car_14 / length(dataset[[38]])
## [1] 0.07152083
# The column has missing values, so they are removed before computing the
# moments; without that, sd(), var(), the skewness and the kurtosis are all NA.
# The printed values are not shown here; re-run the chunk to obtain them.
sd(dataset[[38]], na.rm = TRUE)
var(dataset[[38]], na.rm = TRUE)
# Skewness of the observed values.
skew_ps_car_14 <- local({
  v <- dataset[[38]]
  v <- v[!is.na(v)]
  mean((v - mean(v))^3 / sd(v)^3)
})
skew_ps_car_14
# Excess kurtosis of the observed values.
kurtosys_ps_car_14 <- local({
  v <- dataset[[38]]
  v <- v[!is.na(v)]
  mean((v - mean(v))^4 / sd(v)^4) - 3
})
kurtosys_ps_car_14
# Largest, smallest and number of boxplot outliers.
max(boxplot.stats(dataset[[38]])$out)
## [1] 0.6363961
min(boxplot.stats(dataset[[38]])$out)
## [1] 0.1095445
sum(table(boxplot.stats(dataset[[38]])$out))
## [1] 46945
# Tukey fences: quartiles -/+ 1.5 * IQR.
x <- dataset[[38]]
qnt <- quantile(x, probs = c(0.25, 0.75), na.rm = TRUE)
qnt
##       25%       75% 
## 0.3504283 0.3977436
caps <- quantile(x, probs = c(0.05, 0.95), na.rm = TRUE)
caps
##        5%       95% 
## 0.3016621 0.4440721
H <- 1.5 * IQR(x, na.rm = TRUE)
H
## [1] 0.07097299
# Comparisons with NA give NA entries, hence the !is.na() when counting.
out1 <- dataset$ps_car_14[dataset$ps_car_14 < (qnt[1] - H)]
out2 <- dataset$ps_car_14[dataset$ps_car_14 > (qnt[2] + H)]
sum(!is.na(out1))
## [1] 2302
sum(!is.na(out1)) / length(dataset$ps_car_14)
## [1] 0.001547014
sum(!is.na(out2))
## [1] 44643
sum(!is.na(out2)) / length(dataset$ps_car_14)
## [1] 0.03000145
# Column 39 (ps_car_15): summary, missing count, spread, shape and outliers.
summary(dataset[[39]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   2.828   3.317   3.067   3.606   3.742
miss_ps_car_15 <- sum(is.na(dataset[[39]]))
miss_ps_car_15
## [1] 0
sd(dataset[[39]])
## [1] 0.729951
var(dataset[[39]])
## [1] 0.5328285
# Skewness: mean cubed deviation divided by the cubed sample sd.
skew_ps_car_15 <- local({
  v <- dataset[[39]]
  mean((v - mean(v))^3 / sd(v)^3)
})
skew_ps_car_15
## [1] -2.220883
# Excess kurtosis: mean fourth-power deviation over sd^4, minus 3.
kurtosys_ps_car_15 <- local({
  v <- dataset[[39]]
  mean((v - mean(v))^4 / sd(v)^4) - 3
})
kurtosys_ps_car_15
## [1] 5.908309
# Largest, smallest and number of boxplot outliers.
max(boxplot.stats(dataset[[39]])$out)
## [1] 1.414214
min(boxplot.stats(dataset[[39]])$out)
## [1] 0
length(boxplot.stats(dataset[[39]])$out)
## [1] 68172
# Tukey fences: quartiles -/+ 1.5 * IQR.
x <- dataset[[39]]
qnt <- quantile(x, probs = c(0.25, 0.75), na.rm = TRUE)
qnt
##      25%      75% 
## 2.828427 3.605551
caps <- quantile(x, probs = c(0.05, 0.95), na.rm = TRUE)
caps
##       5%      95% 
## 1.732051 3.741657
H <- 1.5 * IQR(x, na.rm = TRUE)
H
## [1] 1.165686
out1 <- x[x < qnt[1] - H]  # below the lower fence
out2 <- x[x > qnt[2] + H]  # above the upper fence
sum(!is.na(out1))
## [1] 68172
sum(!is.na(out1)) / length(x)
## [1] 0.04581365
sum(!is.na(out2))
## [1] 0
# Column 40: five-number summary, missing count, spread, shape and outliers.
summary(dataset[[40]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.2000  0.4000  0.4497  0.7000  0.9000
miss_ps_calc_01 <- sum(is.na(dataset[[40]]))
miss_ps_calc_01
## [1] 0
sd(dataset[[40]])
## [1] 0.2872071
var(dataset[[40]])
## [1] 0.0824879
# Skewness: mean cubed deviation divided by the cubed sample sd.
skew_ps_calc_01 <- local({
  v <- dataset[[40]]
  mean((v - mean(v))^3 / sd(v)^3)
})
skew_ps_calc_01
## [1] 0.0007329376
# Excess kurtosis: mean fourth-power deviation over sd^4, minus 3.
kurtosys_ps_calc_01 <- local({
  v <- dataset[[40]]
  mean((v - mean(v))^4 / sd(v)^4) - 3
})
kurtosys_ps_calc_01
## [1] -1.22389
boxplot.stats(dataset[[40]])$out
## numeric(0)
# Column 41: five-number summary, missing count, spread, shape and outliers.
summary(dataset[[41]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.2000  0.5000  0.4501  0.7000  0.9000
miss_ps_calc_02 <- sum(is.na(dataset[[41]]))
miss_ps_calc_02
## [1] 0
sd(dataset[[41]])
## [1] 0.2871817
var(dataset[[41]])
## [1] 0.08247332
# Skewness: mean cubed deviation divided by the cubed sample sd.
skew_ps_calc_02 <- local({
  v <- dataset[[41]]
  mean((v - mean(v))^3 / sd(v)^3)
})
skew_ps_calc_02
## [1] -6.740767e-05
# Excess kurtosis: mean fourth-power deviation over sd^4, minus 3.
kurtosys_ps_calc_02 <- local({
  v <- dataset[[41]]
  mean((v - mean(v))^4 / sd(v)^4) - 3
})
kurtosys_ps_calc_02
## [1] -1.223397
boxplot.stats(dataset[[41]])$out
## numeric(0)
# Column 42: five-number summary, missing count, spread, shape and outliers.
summary(dataset[[42]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    0.20    0.50    0.45    0.70    0.90
miss_ps_calc_03 <- sum(is.na(dataset[[42]]))
miss_ps_calc_03
## [1] 0
sd(dataset[[42]])
## [1] 0.2872136
var(dataset[[42]])
## [1] 0.08249165
# Skewness: mean cubed deviation divided by the cubed sample sd.
skew_ps_calc_03 <- local({
  v <- dataset[[42]]
  mean((v - mean(v))^3 / sd(v)^3)
})
skew_ps_calc_03
## [1] 0.0002636599
# Excess kurtosis: mean fourth-power deviation over sd^4, minus 3.
kurtosys_ps_calc_03 <- local({
  v <- dataset[[42]]
  mean((v - mean(v))^4 / sd(v)^4) - 3
})
kurtosys_ps_calc_03
## [1] -1.224441
boxplot.stats(dataset[[42]])$out
## numeric(0)

Outliers make up less than 10% of the observations in each numerical variable that has them. I think it is better not to normalize these variables, because that would change their distributions completely; it is better to deal with the outliers in another way.

Bivariate Analysis

Analysis on categorical variables studying relationship between outcome variable and explanatory variables

# Categorical subset: everything except the id (column 1) and the continuous
# variables (columns 21:23 and 36:42). The first remaining column is the
# outcome. NOTE: the name `cat` masks base::cat() as a variable; it is kept
# because later statements refer to it.
cat <- dataset[, -c(1, 21:23, 36:42)]
# Convert every column to a factor, whatever the column count happens to be
# (the previous loop hard-coded 48 columns).
cat[] <- lapply(cat, as.factor)
y <- cat[, 1]
print(chisq.test(table(y,cat[,2])))
## 
##  Pearson's Chi-squared test
## 
## data:  table(y, cat[, 2])
## X-squared = 255.05, df = 7, p-value < 2.2e-16
print(chisq.test(table(y,cat[,3])))
## 
##  Pearson's Chi-squared test
## 
## data:  table(y, cat[, 3])
## X-squared = 23.04, df = 3, p-value = 3.962e-05
print(chisq.test(table(y,cat[,4])))
## 
##  Pearson's Chi-squared test
## 
## data:  table(y, cat[, 4])
## X-squared = 660.12, df = 11, p-value < 2.2e-16
print(chisq.test(table(y,cat[,5])))
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  table(y, cat[, 5])
## X-squared = 61.149, df = 1, p-value = 5.291e-15
print(chisq.test(table(y,cat[,6])))
## 
##  Pearson's Chi-squared test
## 
## data:  table(y, cat[, 6])
## X-squared = 771.16, df = 6, p-value < 2.2e-16
print(chisq.test(table(y,cat[,7])))
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  table(y, cat[, 7])
## X-squared = 688.38, df = 1, p-value < 2.2e-16
print(chisq.test(table(y,cat[,8])))
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  table(y, cat[, 8])
## X-squared = 696.5, df = 1, p-value < 2.2e-16
print(chisq.test(table(y,cat[,9])))
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  table(y, cat[, 9])
## X-squared = 102.69, df = 1, p-value < 2.2e-16
print(chisq.test(table(y,cat[,10])))
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  table(y, cat[, 10])
## X-squared = 40.269, df = 1, p-value = 2.213e-10
print(chisq.test(table(y,cat[,11])))
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  table(y, cat[, 11])
## X-squared = 1.4908, df = 1, p-value = 0.2221
print(chisq.test(table(y,cat[,12])))
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  table(y, cat[, 12])
## X-squared = 2.1921, df = 1, p-value = 0.1387
print(chisq.test(table(y,cat[,13])))
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  table(y, cat[, 13])
## X-squared = 35.879, df = 1, p-value = 2.1e-09
print(chisq.test(table(y,cat[,14])))
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  table(y, cat[, 14])
## X-squared = 3.1888, df = 1, p-value = 0.07415
print(chisq.test(table(y,cat[,15])))
## 
##  Pearson's Chi-squared test
## 
## data:  table(y, cat[, 15])
## X-squared = 38.016, df = 4, p-value = 1.112e-07
print(chisq.test(table(y,cat[,16])))
## 
##  Pearson's Chi-squared test
## 
## data:  table(y, cat[, 16])
## X-squared = 334.55, df = 13, p-value < 2.2e-16
print(chisq.test(table(y,cat[,17])))
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  table(y, cat[, 17])
## X-squared = 458.97, df = 1, p-value < 2.2e-16
print(chisq.test(table(y,cat[,18])))
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  table(y, cat[, 18])
## X-squared = 816.56, df = 1, p-value < 2.2e-16
print(chisq.test(table(y,cat[,19])))
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  table(y, cat[, 19])
## X-squared = 12.285, df = 1, p-value = 0.0004567
print(chisq.test(table(y,cat[,20])))
## 
##  Pearson's Chi-squared test
## 
## data:  table(y, cat[, 20])
## X-squared = 770.78, df = 11, p-value < 2.2e-16
print(chisq.test(table(y,cat[,21])))
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  table(y, cat[, 21])
## X-squared = 591.82, df = 1, p-value < 2.2e-16
print(chisq.test(table(y,cat[,22])))
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  table(y, cat[, 22])
## X-squared = 78.848, df = 1, p-value < 2.2e-16
print(chisq.test(table(y,cat[,23])))
## 
##  Pearson's Chi-squared test
## 
## data:  table(y, cat[, 23])
## X-squared = 866.46, df = 9, p-value < 2.2e-16
print(chisq.test(table(y,cat[,24])))
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  table(y, cat[, 24])
## X-squared = 0.24066, df = 1, p-value = 0.6237
print(chisq.test(table(y,cat[,25])))
## 
##  Pearson's Chi-squared test
## 
## data:  table(y, cat[, 25])
## X-squared = 831.49, df = 17, p-value < 2.2e-16
print(chisq.test(table(y,cat[,26])))
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  table(y, cat[, 26])
## X-squared = 226.87, df = 1, p-value < 2.2e-16
print(chisq.test(table(y,cat[,27])))
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  table(y, cat[, 27])
## X-squared = 246.01, df = 1, p-value < 2.2e-16
print(chisq.test(table(y,cat[,28])))
## 
##  Pearson's Chi-squared test
## 
## data:  table(y, cat[, 28])
## X-squared = 420.39, df = 4, p-value < 2.2e-16
print(chisq.test(table(y,cat[,29])))
## 
##  Pearson's Chi-squared test
## 
## data:  table(y, cat[, 29])
## X-squared = 0.64897, df = 2, p-value = 0.7229
print(chisq.test(table(y,cat[,30])))
## 
##  Pearson's Chi-squared test
## 
## data:  table(y, cat[, 30])
## X-squared = 1490.1, df = 103, p-value < 2.2e-16
print(chisq.test(table(y,cat[,31])))
## 
##  Pearson's Chi-squared test
## 
## data:  table(y, cat[, 31])
## X-squared = 255.65, df = 3, p-value < 2.2e-16
print(chisq.test(table(y,cat[,32])))
## 
##  Pearson's Chi-squared test
## 
## data:  table(y, cat[, 32])
## X-squared = 3.0938, df = 5, p-value = 0.6855
print(chisq.test(table(y,cat[,33])))
## 
##  Pearson's Chi-squared test
## 
## data:  table(y, cat[, 33])
## X-squared = 6.9657, df = 6, p-value = 0.324
print(chisq.test(table(y,cat[,34])))
## 
##  Pearson's Chi-squared test
## 
## data:  table(y, cat[, 34])
## X-squared = 7.3787, df = 10, p-value = 0.6893
print(chisq.test(table(y,cat[,35])))
## 
##  Pearson's Chi-squared test
## 
## data:  table(y, cat[, 35])
## X-squared = 11.443, df = 9, p-value = 0.2466
# Levels with no rows in the cross-table give zero expected counts, which makes
# the statistic NaN (the earlier run printed X-squared = NaN, p-value = NA).
# Empty columns are therefore dropped before testing; re-run for the output.
tab <- table(y, cat[, 36])
print(chisq.test(tab[, colSums(tab) > 0, drop = FALSE]))
print(chisq.test(table(y,cat[,37])))
## 
##  Pearson's Chi-squared test
## 
## data:  table(y, cat[, 37])
## X-squared = 6.1507, df = 7, p-value = 0.5223
print(chisq.test(table(y,cat[,38])))
## 
##  Pearson's Chi-squared test
## 
## data:  table(y, cat[, 38])
## X-squared = 24.017, df = 25, p-value = 0.5184
# Columns 39 to 42 of `cat`: levels with no rows in the cross-table give zero
# expected counts, which makes the statistic NaN (the earlier run printed
# X-squared = NaN, p-value = NA for all four). Empty columns are dropped before
# testing; re-run for the refreshed output.
for (j in 39:42) {
  tab <- table(y, cat[, j])
  print(chisq.test(tab[, colSums(tab) > 0, drop = FALSE]))
}
print(chisq.test(table(y,cat[,43])))
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  table(y, cat[, 43])
## X-squared = 0.13529, df = 1, p-value = 0.713
print(chisq.test(table(y,cat[,44])))
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  table(y, cat[, 44])
## X-squared = 0.2248, df = 1, p-value = 0.6354
print(chisq.test(table(y,cat[,45])))
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  table(y, cat[, 45])
## X-squared = 0.01545, df = 1, p-value = 0.9011
print(chisq.test(table(y,cat[,46])))
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  table(y, cat[, 46])
## X-squared = 0.17519, df = 1, p-value = 0.6755
print(chisq.test(table(y,cat[,47])))
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  table(y, cat[, 47])
## X-squared = 1.7905, df = 1, p-value = 0.1809
print(chisq.test(table(y,cat[,48])))
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  table(y, cat[, 48])
## X-squared = 0.66851, df = 1, p-value = 0.4136

Not all variables show a statistically significant association with the outcome.

Analysis on numerical variables studying relationship between outcome variable and explanatory variables

# Continuous explanatory variables: columns 21:23 and 36:42 of the dataset.
num <- dataset[, c(21:23, 36:42), drop = FALSE]

correlation

# Spearman rank correlations among the continuous variables, computed on rows
# without missing values only.
cornum <- cor(num, method = "spearman", use = "complete.obs")
# Distribution of the pairwise coefficients (upper triangle, diagonal excluded).
summary(cornum[upper.tri(cornum, diag = FALSE)])
##       Min.    1st Qu.     Median       Mean    3rd Qu.       Max. 
## -0.0520355  0.0002341  0.0011868  0.1075220  0.1059585  0.6844952
corrplot(cornum, method = "number", type = "lower", diag = FALSE, tl.col = "black")

The corrplot shows no high correlations among the numerical variables.

logistic regression with Type III Wald tests (used in place of a one-way ANOVA because the outcome is binary)

# Logistic regression of the outcome factor `y` (taken from the workspace) on
# all continuous variables in `num`; rows with missing values are omitted.
fit1 <- glm(
  y ~ .,
  family = binomial(link = "logit"),
  data = num,
  na.action = na.omit
)
# Standard glm diagnostic plots on a 2 x 2 grid.
par(mfrow = c(2, 2))
plot(fit1)

# Type III Wald chi-square test for each term.
Anova(fit1, type = 3, test.statistic = "Wald", singular.ok = TRUE)
## Analysis of Deviance Table (Type III tests)
## 
## Response: y
##             Df     Chisq Pr(>Chisq)    
## (Intercept)  1 1868.2474  < 2.2e-16 ***
## ps_reg_01    1    1.5207     0.2175    
## ps_reg_02    1   21.2829  3.963e-06 ***
## ps_reg_03    1   21.5484  3.450e-06 ***
## ps_car_12    1   65.4737  5.890e-16 ***
## ps_car_13    1  186.6955  < 2.2e-16 ***
## ps_car_14    1  116.0678  < 2.2e-16 ***
## ps_car_15    1   41.7664  1.029e-10 ***
## ps_calc_01   1    0.7684     0.3807    
## ps_calc_02   1    0.0299     0.8626    
## ps_calc_03   1    1.7033     0.1919    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

not all variables seem relevant to explain the outcome

Multivariate analysis takes too long to run here; it could instead be done with sparklyr or H2O.

management of missing values

For variables with less than 10% missing values, I impute manually with a central value or the most frequent value, because this does not materially change the distribution.

For categorical variables I use the mode; for numerical variables, the median.

# Manual imputation for variables with few missing values.
# Categorical / ordinal variables: fixed replacement values (presumably the
# modes read off the frequency tables -- confirm against the univariate output).
dataset$ps_ind_02_cat[is.na(dataset$ps_ind_02_cat)] <- 1
dataset$ps_ind_04_cat[is.na(dataset$ps_ind_04_cat)] <- 0
dataset$ps_ind_05_cat[is.na(dataset$ps_ind_05_cat)] <- 0
dataset$ps_car_01_cat[is.na(dataset$ps_car_01_cat)] <- 11
dataset$ps_car_02_cat[is.na(dataset$ps_car_02_cat)] <- 1
dataset$ps_car_07_cat[is.na(dataset$ps_car_07_cat)] <- 1
dataset$ps_car_09_cat[is.na(dataset$ps_car_09_cat)] <- 2
dataset$ps_car_11[is.na(dataset$ps_car_11)] <- 2
# Continuous variables: use the median of the observed values. The previous
# literals (0.3742 and 0.37) were copied from rounded summary() output and so
# were not exactly the medians.
dataset$ps_car_12[is.na(dataset$ps_car_12)] <-
  median(dataset$ps_car_12, na.rm = TRUE)
dataset$ps_car_14[is.na(dataset$ps_car_14)] <-
  median(dataset$ps_car_14, na.rm = TRUE)

For the variable with more than 10% missing values, impute with caret's “bagImpute” method, using the most representative feature as the predictor; in this case I take the feature most correlated with the outcome.

impute on ps_reg_03 variable

# Outcome (column 2) together with the continuous variables (21:23, 36:42),
# held as a plain data frame.
train1 <- as.data.frame(dataset[, c(2, 21:23, 36:42)])

Looking for the feature most correlated with the outcome, using mixed (heterogeneous) correlations from the polycor package.

# Heterogeneous correlations between the outcome and the continuous variables.
# Stored under its own name so the result no longer masks stats::cor().
het_cor <- hetcor(train1)
corrplot(as.matrix(het_cor), type = "lower", tl.col = "black", diag = FALSE, method = "number")

Building a small dataset with the outcome, the feature most correlated with the outcome, and the variable whose missing values are to be imputed.

# Keep the outcome, the chosen predictor and the variable to impute (the
# columns that remain after dropping positions 2 and 5:11 of train1).
train2 <- train1[, c("target", "ps_reg_02", "ps_reg_03")]

First, transform all features to dummy variables

# Build the encoder from the predictors of train2 (target is the formula
# response), then apply it to obtain the matrix passed to preProcess().
dummies <- dummyVars(formula = target ~ ., data = train2)
train.dummy <- predict(object = dummies, newdata = train2)

Now, impute!

# Fixed seed so the bagged-tree imputation is reproducible.
set.seed(2000)
pre.process <- preProcess(train.dummy, method = "bagImpute")
imputed.data <- predict(pre.process, newdata = train.dummy)
# Second column of the imputed matrix is ps_reg_03; write it back.
dataset$ps_reg_03 <- imputed.data[, "ps_reg_03"]
summary(dataset[["ps_reg_03"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.06124 0.59634 0.75746 0.85617 1.00964 4.42352

Pre-process to clean dataset

Correlation all dataset

Mixed correlations should be used here, but they take a long time to run.

reformatting some variables of dataset

# Coerce the listed columns to integer in one pass.
int_cols <- c(4, 6, 7, 24, 25, 30, 32, 35)
dataset[int_cols] <- lapply(dataset[int_cols], as.integer)

Before computing correlations I remove the “id” variable and two other variables with more than 44% missing values.

# Drop columns 1, 26 and 28 before correlating.
dataset <- dataset[, -c(1, 26, 28)]
# Spearman correlations over complete rows. The matrix is stored as `cor_all`
# so that it no longer masks stats::cor(), the function that computes it.
cor_all <- cor(dataset, use = "complete.obs", method = "spearman")
summary(cor_all[upper.tri(cor_all)])
##       Min.    1st Qu.     Median       Mean    3rd Qu.       Max. 
## -0.5996363 -0.0013436  0.0001581  0.0046242  0.0020814  0.9378869
# Number of variable pairs with |rho| above 0.75.
highcor <- sum(abs(cor_all[upper.tri(cor_all)]) > .75)
highcor
## [1] 1
corrplot(cor_all, type = "lower", tl.col = "black", diag = FALSE, method = "number")

removing the feature with high correlation

# Drop column 15, the member of the highly correlated pair chosen for removal
# (presumably ps_ind_14 -- confirm against names(dataset)).
dataset <- dataset[, -15, drop = FALSE]

Identifying and removing zero- and near-zero-variance predictors

# Per-column near-zero-variance metrics, then the flagged columns only.
nzv <- nearZeroVar(dataset, saveMetrics = TRUE)
# head() stops at the last available row; indexing with [1:15, ] padded the
# listing with all-NA rows because only seven columns are flagged.
head(nzv[nzv$nzv, ], 15)
##                freqRatio percentUnique zeroVar  nzv
## target          26.43671  0.0001344061   FALSE TRUE
## ps_ind_05_cat   25.71334  0.0004704213   FALSE TRUE
## ps_ind_10_bin 2680.13153  0.0001344061   FALSE TRUE
## ps_ind_11_bin  611.10531  0.0001344061   FALSE TRUE
## ps_ind_12_bin  105.37129  0.0001344061   FALSE TRUE
## ps_ind_13_bin  996.33780  0.0001344061   FALSE TRUE
## ps_car_10_cat  121.57713  0.0002016091   FALSE TRUE
# Positions of the near-zero-variance columns.
nzv <- nearZeroVar(dataset)
# Remove them by name, always keeping the outcome. The previous form,
# dataset[, -nzv[-1]], relied on `target` being the first flagged column and
# would have dropped every column if nothing else had been flagged.
drop_cols <- setdiff(names(dataset)[nzv], "target")
dataset <- dataset[, !(names(dataset) %in% drop_cols), drop = FALSE]
dim(dataset)
## [1] 1488028      49
str(dataset)
## 'data.frame':    1488028 obs. of  49 variables:
##  $ target        : int  0 0 0 0 0 0 0 0 0 1 ...
##  $ ps_ind_01     : int  2 1 5 0 0 5 2 5 5 1 ...
##  $ ps_ind_02_cat : int  2 1 4 1 2 1 1 1 1 1 ...
##  $ ps_ind_03     : int  5 7 9 2 0 4 3 4 3 2 ...
##  $ ps_ind_04_cat : int  1 0 1 0 1 0 1 0 1 0 ...
##  $ ps_ind_06_bin : int  0 0 0 1 1 0 0 1 0 0 ...
##  $ ps_ind_07_bin : int  1 0 0 0 0 0 1 0 0 1 ...
##  $ ps_ind_08_bin : int  0 1 1 0 0 0 0 0 1 0 ...
##  $ ps_ind_09_bin : int  0 0 0 0 0 1 0 0 0 0 ...
##  $ ps_ind_15     : int  11 3 12 8 9 6 8 13 6 4 ...
##  $ ps_ind_16_bin : int  0 0 1 1 1 1 1 1 1 0 ...
##  $ ps_ind_17_bin : int  1 0 0 0 0 0 0 0 0 0 ...
##  $ ps_ind_18_bin : int  0 1 0 0 0 0 0 0 0 1 ...
##  $ ps_reg_01     : num  0.7 0.8 0 0.9 0.7 0.9 0.6 0.7 0.9 0.9 ...
##  $ ps_reg_02     : num  0.2 0.4 0 0.2 0.6 1.8 0.1 0.4 0.7 1.4 ...
##  $ ps_reg_03     : num  0.718 0.766 0.775 0.581 0.841 ...
##  $ ps_car_01_cat : int  10 11 7 7 11 10 6 11 10 11 ...
##  $ ps_car_02_cat : int  1 1 1 1 1 0 1 1 1 0 ...
##  $ ps_car_04_cat : int  0 0 0 0 0 0 0 0 0 1 ...
##  $ ps_car_06_cat : int  4 11 14 11 14 14 11 11 14 14 ...
##  $ ps_car_07_cat : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ ps_car_08_cat : int  0 1 1 1 1 1 1 1 1 1 ...
##  $ ps_car_09_cat : int  0 2 2 3 2 0 0 2 0 2 ...
##  $ ps_car_11_cat : int  12 19 60 104 82 104 99 30 68 104 ...
##  $ ps_car_11     : int  2 3 1 1 3 2 2 3 3 2 ...
##  $ ps_car_12     : num  0.4 0.316 0.316 0.374 0.316 ...
##  $ ps_car_13     : num  0.884 0.619 0.642 0.543 0.566 ...
##  $ ps_car_14     : num  0.371 0.389 0.347 0.295 0.365 ...
##  $ ps_car_15     : num  3.61 2.45 3.32 2 2 ...
##  $ ps_calc_01    : num  0.6 0.3 0.5 0.6 0.4 0.7 0.2 0.1 0.9 0.7 ...
##  $ ps_calc_02    : num  0.5 0.1 0.7 0.9 0.6 0.8 0.6 0.5 0.8 0.8 ...
##  $ ps_calc_03    : num  0.2 0.3 0.1 0.1 0 0.4 0.5 0.1 0.6 0.8 ...
##  $ ps_calc_04    : int  3 2 2 2 2 3 2 1 3 2 ...
##  $ ps_calc_05    : int  1 1 2 4 2 1 2 2 1 2 ...
##  $ ps_calc_06    : int  10 9 9 7 6 8 8 7 7 8 ...
##  $ ps_calc_07    : int  1 5 1 1 3 2 1 1 3 2 ...
##  $ ps_calc_08    : int  10 8 8 8 10 11 8 6 9 9 ...
##  $ ps_calc_09    : int  1 1 2 4 2 3 3 1 4 1 ...
##  $ ps_calc_10    : int  5 7 7 2 12 8 10 13 11 11 ...
##  $ ps_calc_11    : int  9 3 4 2 3 4 3 7 4 3 ...
##  $ ps_calc_12    : int  1 1 2 2 1 2 0 1 2 5 ...
##  $ ps_calc_13    : int  5 1 7 4 1 0 0 3 1 0 ...
##  $ ps_calc_14    : int  8 9 7 9 3 9 10 6 5 6 ...
##  $ ps_calc_15_bin: int  0 0 0 0 0 0 0 1 0 0 ...
##  $ ps_calc_16_bin: int  1 1 1 0 0 1 1 0 1 1 ...
##  $ ps_calc_17_bin: int  1 1 1 0 0 0 0 1 0 0 ...
##  $ ps_calc_18_bin: int  0 0 0 0 1 1 0 0 0 0 ...
##  $ ps_calc_19_bin: int  0 1 1 0 1 1 1 1 0 1 ...
##  $ ps_calc_20_bin: int  1 0 0 0 0 1 0 0 1 0 ...

collinearity

# The training rows come first because `dataset` was built with
# bind_rows(train, test); take their count from `train` instead of hard-coding it.
training <- dataset[seq_len(nrow(train)), ]
# Look for exact linear dependencies among the columns.
comboInfo <- findLinearCombos(training)
comboInfo
## $linearCombos
## list()
## 
## $remove
## NULL

save dataset for the next step

# Write the cleaned dataset: unquoted values, empty string for NA, no row names.
write.csv(
  dataset,
  file = "C:/Users/user/Desktop/Kaggle/Porto Seguro/dataset.csv",
  quote = FALSE,
  na = "",
  row.names = FALSE
)