library(tidyverse)
library(corrplot)
library(caret)
library(polycor)
library(car)
# Read the train and test CSVs; the strings "-1" and "-1.0" are parsed as NA.
train <- read.csv(
  "C:/Users/user/Desktop/Kaggle/Porto Seguro/train.csv",
  na.strings = c("-1", "-1.0")
)
test <- read.csv(
  "C:/Users/user/Desktop/Kaggle/Porto Seguro/test.csv",
  na.strings = c("-1", "-1.0")
)
# Stack train on top of test so every feature is summarised over both sets.
dataset <- bind_rows(train, test)
# ---- target (column 2) ----
summary(dataset[[2]])
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0 0 0 0 0 1 892816
table(dataset[[2]])
##
## 0 1
## 573518 21694
# Class shares relative to the number of training rows.
table(dataset[[2]]) / nrow(train)
##
## 0 1
## 0.96355248 0.03644752
miss_target <- sum(is.na(dataset[[2]]))
miss_target # they are values to predict in the test dataset
## [1] 892816
# ---- ps_ind_06_bin (column 8) ----
summary(dataset[[8]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.3934 1.0000 1.0000
table(dataset[[8]])
##
## 0 1
## 902572 585456
table(dataset[[8]]) / nrow(dataset)
##
## 0 1
## 0.6065558 0.3934442
miss_ps_ind_06_bin <- sum(is.na(dataset[[8]]))
miss_ps_ind_06_bin
## [1] 0

# ---- ps_ind_07_bin (column 9) ----
summary(dataset[[9]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.2571 1.0000 1.0000
table(dataset[[9]])
##
## 0 1
## 1105415 382613
table(dataset[[9]]) / nrow(dataset)
##
## 0 1
## 0.7428724 0.2571276
miss_ps_ind_07_bin <- sum(is.na(dataset[[9]]))
miss_ps_ind_07_bin
## [1] 0

# ---- ps_ind_08_bin (column 10) ----
summary(dataset[[10]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.1638 0.0000 1.0000
table(dataset[[10]])
##
## 0 1
## 1244343 243685
table(dataset[[10]]) / nrow(dataset)
##
## 0 1
## 0.8362363 0.1637637
miss_ps_ind_08_bin <- sum(is.na(dataset[[10]]))
miss_ps_ind_08_bin
## [1] 0
# ---- ps_ind_09_bin (column 11) ----
summary(dataset[[11]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.1857 0.0000 1.0000
table(dataset[[11]])
##
## 0 1
## 1211754 276274
# Class shares over all rows, as reported for every other binary feature.
table(dataset[[11]]) / nrow(dataset)
miss_ps_ind_09_bin <- sum(is.na(dataset[[11]]))
miss_ps_ind_09_bin
## [1] 0
# ---- ps_ind_10_bin (column 12) ----
summary(dataset[[12]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000000 0.000000 0.000000 0.000373 0.000000 1.000000
table(dataset[[12]])
##
## 0 1
## 1487473 555
table(dataset[[12]]) / nrow(dataset)
##
## 0 1
## 0.9996270231 0.0003729769
miss_ps_ind_10_bin <- sum(is.na(dataset[[12]]))
miss_ps_ind_10_bin
## [1] 0

# ---- ps_ind_11_bin (column 13) ----
summary(dataset[[13]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000000 0.000000 0.000000 0.001634 0.000000 1.000000
table(dataset[[13]])
##
## 0 1
## 1485597 2431
table(dataset[[13]]) / nrow(dataset)
##
## 0 1
## 0.998366294 0.001633706
miss_ps_ind_11_bin <- sum(is.na(dataset[[13]]))
miss_ps_ind_11_bin
## [1] 0

# ---- ps_ind_12_bin (column 14) ----
summary(dataset[[14]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000000 0.000000 0.000000 0.009401 0.000000 1.000000
table(dataset[[14]])
##
## 0 1
## 1474039 13989
table(dataset[[14]]) / nrow(dataset)
##
## 0 1
## 0.990598967 0.009401033
miss_ps_ind_12_bin <- sum(is.na(dataset[[14]]))
miss_ps_ind_12_bin
## [1] 0

# ---- ps_ind_13_bin (column 15) ----
summary(dataset[[15]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000000 0.000000 0.000000 0.001003 0.000000 1.000000
table(dataset[[15]])
##
## 0 1
## 1486536 1492
table(dataset[[15]]) / nrow(dataset)
##
## 0 1
## 0.998997331 0.001002669
miss_ps_ind_13_bin <- sum(is.na(dataset[[15]]))
miss_ps_ind_13_bin
## [1] 0

# ---- ps_ind_16_bin (column 18) ----
summary(dataset[[18]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 1.0000 0.6607 1.0000 1.0000
table(dataset[[18]])
##
## 0 1
## 504912 983116
table(dataset[[18]]) / nrow(dataset)
##
## 0 1
## 0.3393162 0.6606838
miss_ps_ind_16_bin <- sum(is.na(dataset[[18]]))
miss_ps_ind_16_bin
## [1] 0

# ---- ps_ind_17_bin (column 19) ----
summary(dataset[[19]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.1207 0.0000 1.0000
table(dataset[[19]])
##
## 0 1
## 1308465 179563
table(dataset[[19]]) / nrow(dataset)
##
## 0 1
## 0.8793282 0.1206718
miss_ps_ind_17_bin <- sum(is.na(dataset[[19]]))
miss_ps_ind_17_bin
## [1] 0

# ---- ps_ind_18_bin (column 20) ----
summary(dataset[[20]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.1544 0.0000 1.0000
table(dataset[[20]])
##
## 0 1
## 1258333 229695
table(dataset[[20]]) / nrow(dataset)
##
## 0 1
## 0.845638 0.154362
miss_ps_ind_18_bin <- sum(is.na(dataset[[20]]))
miss_ps_ind_18_bin
## [1] 0

# ---- ps_calc_15_bin (column 54) ----
summary(dataset[[54]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.1232 0.0000 1.0000
table(dataset[[54]])
##
## 0 1
## 1304699 183329
table(dataset[[54]]) / nrow(dataset)
##
## 0 1
## 0.8767973 0.1232027
miss_ps_calc_15_bin <- sum(is.na(dataset[[54]]))
miss_ps_calc_15_bin
## [1] 0

# ---- ps_calc_17_bin (column 56) ----
summary(dataset[[56]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 1.0000 0.5545 1.0000 1.0000
table(dataset[[56]])
##
## 0 1
## 662963 825065
table(dataset[[56]]) / nrow(dataset)
##
## 0 1
## 0.4455313 0.5544687
miss_ps_calc_17_bin <- sum(is.na(dataset[[56]]))
miss_ps_calc_17_bin
## [1] 0

# ---- ps_calc_18_bin (column 57) ----
summary(dataset[[57]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.2876 1.0000 1.0000
table(dataset[[57]])
##
## 0 1
## 1060145 427883
table(dataset[[57]]) / nrow(dataset)
##
## 0 1
## 0.7124496 0.2875504
miss_ps_calc_18_bin <- sum(is.na(dataset[[57]]))
miss_ps_calc_18_bin
## [1] 0

# ---- ps_calc_19_bin (column 58) ----
summary(dataset[[58]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.3492 1.0000 1.0000
table(dataset[[58]])
##
## 0 1
## 968385 519643
table(dataset[[58]]) / nrow(dataset)
##
## 0 1
## 0.6507841 0.3492159
miss_ps_calc_19_bin <- sum(is.na(dataset[[58]]))
miss_ps_calc_19_bin
## [1] 0

# ---- ps_calc_20_bin (column 59) ----
summary(dataset[[59]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.1528 0.0000 1.0000
table(dataset[[59]])
##
## 0 1
## 1260681 227347
table(dataset[[59]]) / nrow(dataset)
##
## 0 1
## 0.8472159 0.1527841
miss_ps_calc_20_bin <- sum(is.na(dataset[[59]]))
miss_ps_calc_20_bin
## [1] 0
# ---- ps_ind_02_cat (column 4) ----
summary(dataset[[4]])
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1.00 1.00 1.00 1.36 2.00 4.00 523
table(dataset[[4]])
##
## 1 2 3 4
## 1079327 309747 70172 28259
# Shares are taken over all rows (NA included), so they sum to slightly < 1.
table(dataset[[4]]) / nrow(dataset)
##
## 1 2 3 4
## 0.72534052 0.20815939 0.04715771 0.01899091
miss_ps_ind_02_cat <- sum(is.na(dataset[[4]]))
miss_ps_ind_02_cat
## [1] 523
# Missing share as a percentage, matching the other categorical features.
miss_ps_ind_02_cat / nrow(dataset) * 100
## [1] 0.03514719
# Categorical features. Level shares divide by the total row count, so for
# columns with missing values the shares do not sum to 1.

# ---- ps_ind_04_cat (column 6) ----
summary(dataset[[6]])
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.0000 0.0000 0.0000 0.4173 1.0000 1.0000 228
table(dataset[[6]])
##
## 0 1
## 866864 620936
table(dataset[[6]]) / nrow(dataset)
##
## 0 1
## 0.5825589 0.4172878
miss_ps_ind_04_cat <- sum(is.na(dataset[[6]]))
miss_ps_ind_04_cat
## [1] 228
miss_ps_ind_04_cat / nrow(dataset) * 100
## [1] 0.01532229

# ---- ps_ind_05_cat (column 7) ----
summary(dataset[[7]])
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.000 0.000 0.000 0.421 0.000 6.000 14519
table(dataset[[7]])
##
## 0 1 2 3 4 5 6
## 1319412 20737 10707 20754 45706 4316 51877
table(dataset[[7]]) / nrow(dataset)
##
## 0 1 2 3 4 5
## 0.886684928 0.013935894 0.007195429 0.013947318 0.030715820 0.002900483
## 6
## 0.034862919
miss_ps_ind_05_cat <- sum(is.na(dataset[[7]]))
miss_ps_ind_05_cat
## [1] 14519
miss_ps_ind_05_cat / nrow(dataset) * 100
## [1] 0.9757209

# ---- ps_car_01_cat (column 24) ----
summary(dataset[[24]])
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.000 7.000 7.000 8.295 11.000 11.000 267
table(dataset[[24]])
##
## 0 1 2 3 4 5 6 7 8 9
## 14844 3379 5242 16682 65720 45082 155779 449617 37603 50501
## 10 11
## 124587 518725
table(dataset[[24]]) / nrow(dataset)
##
## 0 1 2 3 4 5
## 0.009975619 0.002270791 0.003522783 0.011210811 0.044165836 0.030296473
## 6 7 8 9 10 11
## 0.104688218 0.302156277 0.025270358 0.033938205 0.083726247 0.348598951
miss_ps_car_01_cat <- sum(is.na(dataset[[24]]))
miss_ps_car_01_cat
## [1] 267
miss_ps_car_01_cat / nrow(dataset) * 100
## [1] 0.01794321

# ---- ps_car_02_cat (column 25) ----
summary(dataset[[25]])
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.0000 1.0000 1.0000 0.8299 1.0000 1.0000 10
table(dataset[[25]])
##
## 0 1
## 253039 1234979
table(dataset[[25]]) / nrow(dataset)
##
## 0 1
## 0.1700499 0.8299434
miss_ps_car_02_cat <- sum(is.na(dataset[[25]]))
miss_ps_car_02_cat
## [1] 10
miss_ps_car_02_cat / nrow(dataset) * 100
## [1] 0.0006720304

# ---- ps_car_03_cat (column 26) ----
summary(dataset[[26]])
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.0 0.0 1.0 0.6 1.0 1.0 1028142
table(dataset[[26]])
##
## 0 1
## 183044 276842
table(dataset[[26]]) / nrow(dataset)
##
## 0 1
## 0.1230111 0.1860462
miss_ps_car_03_cat <- sum(is.na(dataset[[26]]))
miss_ps_car_03_cat
## [1] 1028142
miss_ps_car_03_cat / nrow(dataset) * 100
## [1] 69.09426

# ---- ps_car_04_cat (column 27) ----
summary(dataset[[27]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.7256 0.0000 9.0000
table(dataset[[27]])
##
## 0 1 2 3 4 5 6 7 8
## 1241334 80561 59088 1713 627 1330 3937 370 51211
## 9
## 47857
table(dataset[[27]]) / nrow(dataset)
##
## 0 1 2 3 4
## 0.8342141411 0.0541394382 0.0397089302 0.0011511880 0.0004213630
## 5 6 7 8 9
## 0.0008938004 0.0026457835 0.0002486512 0.0344153470 0.0321613572
miss_ps_car_04_cat <- sum(is.na(dataset[[27]]))
miss_ps_car_04_cat
## [1] 0

# ---- ps_car_05_cat (column 28) ----
summary(dataset[[28]])
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.0 0.0 1.0 0.5 1.0 1.0 666910
table(dataset[[28]])
##
## 0 1
## 389558 431560
table(dataset[[28]]) / nrow(dataset)
##
## 0 1
## 0.2617948 0.2900214
miss_ps_car_05_cat <- sum(is.na(dataset[[28]]))
miss_ps_car_05_cat
## [1] 666910
miss_ps_car_05_cat / nrow(dataset) * 100
## [1] 44.81838

# ---- ps_car_06_cat (column 29) ----
summary(dataset[[29]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 1.000 7.000 6.561 11.000 17.000
table(dataset[[29]])
##
## 0 1 2 3 4 5 6 7 8 9
## 275497 295574 4123 30031 77845 3479 52571 40643 3471 43810
## 10 11 12 13 14 15 16 17
## 83563 329890 5991 15356 147714 54151 11771 12548
table(dataset[[29]]) / nrow(dataset)
##
## 0 1 2 3 4 5
## 0.185142349 0.198634703 0.002770781 0.020181744 0.052314204 0.002337994
## 6 7 8 9 10 11
## 0.035329308 0.027313330 0.002332617 0.029441650 0.056156873 0.221696097
## 12 13 14 15 16 17
## 0.004026134 0.010319698 0.099268293 0.036391116 0.007910469 0.008432637
miss_ps_car_06_cat <- sum(is.na(dataset[[29]]))
miss_ps_car_06_cat
## [1] 0

# ---- ps_car_07_cat (column 30) ----
summary(dataset[[30]])
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.000 1.000 1.000 0.948 1.000 1.000 28820
table(dataset[[30]])
##
## 0 1
## 76138 1383070
table(dataset[[30]]) / nrow(dataset)
##
## 0 1
## 0.05116705 0.92946504
miss_ps_car_07_cat <- sum(is.na(dataset[[30]]))
miss_ps_car_07_cat
## [1] 28820
miss_ps_car_07_cat / nrow(dataset) * 100
## [1] 1.936792

# ---- ps_car_08_cat (column 31) ----
summary(dataset[[31]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 1.0000 1.0000 0.8322 1.0000 1.0000
table(dataset[[31]])
##
## 0 1
## 249663 1238365
table(dataset[[31]]) / nrow(dataset)
##
## 0 1
## 0.1677811 0.8322189
miss_ps_car_08_cat <- sum(is.na(dataset[[31]]))
miss_ps_car_08_cat
## [1] 0

# ---- ps_car_09_cat (column 32) ----
summary(dataset[[32]])
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.000 0.000 2.000 1.331 2.000 4.000 1446
table(dataset[[32]])
##
## 0 1 2 3 4
## 486510 72947 883326 36798 7001
table(dataset[[32]]) / nrow(dataset)
##
## 0 1 2 3 4
## 0.326949493 0.049022599 0.593621894 0.024729373 0.004704885
miss_ps_car_09_cat <- sum(is.na(dataset[[32]]))
miss_ps_car_09_cat
## [1] 1446
miss_ps_car_09_cat / nrow(dataset) * 100
## [1] 0.09717559

# ---- ps_car_10_cat (column 33) ----
summary(dataset[[33]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 1.0000 1.0000 0.9921 1.0000 2.0000
table(dataset[[33]])
##
## 0 1 2
## 12136 1475460 432
table(dataset[[33]]) / nrow(dataset)
##
## 0 1 2
## 0.0081557605 0.9915539224 0.0002903171
miss_ps_car_10_cat <- sum(is.na(dataset[[33]]))
miss_ps_car_10_cat
## [1] 0

# ---- ps_car_11_cat (column 34) ----
summary(dataset[[34]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 32.00 65.00 62.26 94.00 104.00
table(dataset[[34]])
##
## 1 2 3 4 5 6 7 8 9 10
## 8228 6379 7992 3894 31198 6071 14249 6158 5457 21742
## 11 12 13 14 15 16 17 18 19 20
## 16667 18326 8371 6727 4645 17716 9725 3066 12535 4530
## 21 22 23 24 25 26 27 28 29 30
## 6070 20926 4072 5857 11658 8724 14912 31344 10957 5891
## 31 32 33 34 35 36 37 38 39 40
## 13143 31175 4611 11774 3821 7372 12623 22952 19211 12391
## 41 42 43 44 45 46 47 48 49 50
## 7963 12336 8261 14231 4316 13351 5044 15198 13672 4475
## 51 52 53 54 55 56 57 58 59 60
## 14987 8811 11658 4022 10565 3175 9382 3767 4479 19943
## 61 62 63 64 65 66 67 68 69 70
## 6886 7082 2722 55391 27800 7877 17806 20733 4553 18927
## 71 72 73 74 75 76 77 78 79 80
## 5933 7215 4614 12098 4343 6543 6142 18327 5190 14053
## 81 82 83 84 85 86 87 88 89 90
## 3608 26161 23506 12276 15838 9282 42872 11213 12730 13030
## 91 92 93 94 95 96 97 98 99 100
## 4553 16344 7220 8364 8972 5116 3692 7118 30303 11200
## 101 102 103 104
## 18416 5132 61062 212989
table(dataset[[34]]) / nrow(dataset)
##
## 1 2 3 4 5 6
## 0.005529466 0.004286882 0.005370867 0.002616886 0.020966003 0.004079896
## 7 8 9 10 11 12
## 0.009575761 0.004138363 0.003667270 0.014611284 0.011200730 0.012315628
## 13 14 15 16 17 18
## 0.005625566 0.004520748 0.003121581 0.011905690 0.006535495 0.002060445
## 19 20 21 22 23 24
## 0.008423901 0.003044298 0.004079224 0.014062907 0.002736508 0.003936082
## 25 26 27 28 29 30
## 0.007834530 0.005862793 0.010021317 0.021064120 0.007363437 0.003958931
## 31 32 33 34 35 36
## 0.008832495 0.020950547 0.003098732 0.007912486 0.002567828 0.004954208
## 37 38 39 40 41 42
## 0.008483039 0.015424441 0.012910375 0.008327128 0.005351378 0.008290167
## 43 44 45 46 47 48
## 0.005551643 0.009563664 0.002900483 0.008972277 0.003389721 0.010213517
## 49 50 51 52 53 54
## 0.009187999 0.003007336 0.010071719 0.005921260 0.007834530 0.002702906
## 55 56 57 58 59 60
## 0.007100001 0.002133696 0.006304989 0.002531538 0.003010024 0.013402302
## 61 62 63 64 65 66
## 0.004627601 0.004759319 0.001829267 0.037224434 0.018682444 0.005293583
## 67 68 69 70 71 72
## 0.011966173 0.013933206 0.003059754 0.012719519 0.003987156 0.004848699
## 73 74 75 76 77 78
## 0.003100748 0.008130223 0.002918628 0.004397095 0.004127611 0.012316300
## 79 80 81 82 83 84
## 0.003487838 0.009444043 0.002424686 0.017580986 0.015796746 0.008249845
## 85 86 87 88 89 90
## 0.010643617 0.006237786 0.028811286 0.007535476 0.008554947 0.008756556
## 91 92 93 94 95 96
## 0.003059754 0.010983664 0.004852059 0.005620862 0.006029456 0.003438107
## 97 98 99 100 101 102
## 0.002481136 0.004783512 0.020364536 0.007526740 0.012376111 0.003448860
## 103 104
## 0.041035518 0.143135075
miss_ps_car_11_cat <- sum(is.na(dataset[[34]]))
miss_ps_car_11_cat
## [1] 0
# Integer-valued (ordinal / count) features: five-number summary, level
# counts, level shares over all rows, and the number of missing values.

# ---- ps_ind_01 (column 3) ----
summary(dataset[[3]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 1.000 1.902 3.000 7.000
table(dataset[[3]])
##
## 0 1 2 3 4 5 6 7
## 469109 359925 205761 127634 84045 153663 44486 43405
table(dataset[[3]]) / nrow(dataset)
##
## 0 1 2 3 4 5
## 0.31525549 0.24188053 0.13827764 0.08577392 0.05648079 0.10326620
## 6 7
## 0.02989594 0.02916948
miss_ps_ind_01 <- sum(is.na(dataset[[3]]))
miss_ps_ind_01
## [1] 0

# ---- ps_ind_03 (column 5) ----
summary(dataset[[5]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 2.000 4.000 4.418 6.000 11.000
table(dataset[[5]])
##
## 0 1 2 3 4 5 6 7 8 9
## 38878 170922 240652 204836 167980 157043 150987 130452 98528 63551
## 10 11
## 35863 28336
table(dataset[[5]]) / nrow(dataset)
##
## 0 1 2 3 4 5
## 0.02612720 0.11486477 0.16172545 0.13765601 0.11288766 0.10553766
## 6 7 8 9 10 11
## 0.10146785 0.08766771 0.06621381 0.04270820 0.02410102 0.01904265
miss_ps_ind_03 <- sum(is.na(dataset[[5]]))
miss_ps_ind_03
## [1] 0

# ---- ps_ind_14 (column 16) ----
summary(dataset[[16]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.01241 0.00000 4.00000
table(dataset[[16]])
##
## 0 1 2 3 4
## 1472125 13707 1847 330 19
table(dataset[[16]]) / nrow(dataset)
##
## 0 1 2 3 4
## 9.893127e-01 9.211520e-03 1.241240e-03 2.217700e-04 1.276858e-05
miss_ps_ind_14 <- sum(is.na(dataset[[16]]))
miss_ps_ind_14
## [1] 0

# ---- ps_ind_15 (column 17) ----
summary(dataset[[17]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 5.000 7.000 7.298 10.000 13.000
table(dataset[[17]])
##
## 0 1 2 3 4 5 6 7 8 9
## 78880 29476 46951 81057 104192 106079 145977 163865 149445 113902
## 10 11 12 13
## 135354 133059 112023 87768
table(dataset[[17]]) / nrow(dataset)
##
## 0 1 2 3 4 5
## 0.05300976 0.01980877 0.03155250 0.05447277 0.07002019 0.07128831
## 6 7 8 9 10 11
## 0.09810098 0.11012226 0.10043158 0.07654560 0.09096200 0.08941969
## 12 13
## 0.07528286 0.05898276
miss_ps_ind_15 <- sum(is.na(dataset[[17]]))
miss_ps_ind_15
## [1] 0

# ---- ps_car_11 (column 35) ----
summary(dataset[[35]])
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.000 2.000 3.000 2.346 3.000 3.000 6
table(dataset[[35]])
##
## 0 1 2 3
## 64634 152661 473339 797388
table(dataset[[35]]) / nrow(dataset)
##
## 0 1 2 3
## 0.04343601 0.10259283 0.31809818 0.53586895
miss_ps_car_11 <- sum(is.na(dataset[[35]]))
miss_ps_car_11
## [1] 6
miss_ps_car_11 / nrow(dataset) * 100
## [1] 0.0004032182

# ---- ps_calc_04 (column 43) ----
summary(dataset[[43]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 2.000 2.000 2.372 3.000 5.000
table(dataset[[43]])
##
## 0 1 2 3 4 5
## 59597 270227 485574 438576 198267 35787
table(dataset[[43]]) / nrow(dataset)
##
## 0 1 2 3 4 5
## 0.04005099 0.18160075 0.32632047 0.29473639 0.13324144 0.02404995
miss_ps_calc_04 <- sum(is.na(dataset[[43]]))
miss_ps_calc_04
## [1] 0

# ---- ps_calc_05 (column 44) ----
summary(dataset[[44]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 1.000 2.000 1.886 3.000 6.000
table(dataset[[44]])
##
## 0 1 2 3 4 5 6
## 153919 426455 487503 297928 102032 18766 1425
table(dataset[[44]]) / nrow(dataset)
##
## 0 1 2 3 4
## 0.1034382418 0.2865907093 0.3276168190 0.2002166626 0.0685686022
## 5 6
## 0.0126113218 0.0009576433
miss_ps_calc_05 <- sum(is.na(dataset[[44]]))
miss_ps_calc_05
## [1] 0

# ---- ps_calc_06 (column 45) ----
summary(dataset[[45]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 7.000 8.000 7.688 9.000 10.000
table(dataset[[45]])
##
## 0 1 2 3 4 5 6 7 8 9
## 1 26 317 2913 16706 66566 184040 350227 437098 322361
## 10
## 107773
table(dataset[[45]]) / nrow(dataset)
##
## 0 1 2 3 4
## 6.720304e-07 1.747279e-05 2.130336e-04 1.957624e-03 1.122694e-02
## 5 6 7 8 9
## 4.473437e-02 1.236805e-01 2.353632e-01 2.937431e-01 2.166364e-01
## 10
## 7.242673e-02
miss_ps_calc_06 <- sum(is.na(dataset[[45]]))
miss_ps_calc_06
## [1] 0

# ---- ps_calc_07 (column 46) ----
summary(dataset[[46]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 2.000 3.000 3.008 4.000 9.000
table(dataset[[46]])
##
## 0 1 2 3 4 5 6 7 8 9
## 38224 172836 346628 406921 305796 153767 51322 11082 1383 69
table(dataset[[46]]) / nrow(dataset)
##
## 0 1 2 3 4
## 0.0256876887 0.1161510402 0.2329445414 0.2734632682 0.2055041975
## 5 6 7 8 9
## 0.1033360931 0.0344899424 0.0074474405 0.0009294180 0.0000463701
miss_ps_calc_07 <- sum(is.na(dataset[[46]]))
miss_ps_calc_07
## [1] 0

# ---- ps_calc_08 (column 47) ----
summary(dataset[[47]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 8.000 9.000 9.226 10.000 12.000
table(dataset[[47]])
##
## 1 2 3 4 5 6 7 8 9 10
## 2 29 265 2111 11180 43098 123797 257182 378846 379246
## 11 12
## 228801 63471
table(dataset[[47]]) / nrow(dataset)
##
## 1 2 3 4 5
## 1.344061e-06 1.948888e-05 1.780880e-04 1.418656e-03 7.513299e-03
## 6 7 8 9 10
## 2.896316e-02 8.319534e-02 1.728341e-01 2.545960e-01 2.548648e-01
## 11 12
## 1.537612e-01 4.265444e-02
miss_ps_calc_08 <- sum(is.na(dataset[[47]]))
miss_ps_calc_08
## [1] 0

# ---- ps_calc_09 (column 48) ----
summary(dataset[[48]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 1.000 2.000 2.339 3.000 7.000
table(dataset[[48]])
##
## 0 1 2 3 4 5 6 7
## 86476 303295 456770 381570 191957 57710 9568 682
table(dataset[[48]]) / nrow(dataset)
##
## 0 1 2 3 4
## 0.0581144978 0.2038234496 0.3069633098 0.2564266264 0.1290009328
## 5 6 7
## 0.0387828724 0.0064299865 0.0004583247
miss_ps_calc_09 <- sum(is.na(dataset[[48]]))
miss_ps_calc_09
## [1] 0

# ---- ps_calc_10 (column 49) ----
summary(dataset[[49]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 6.000 8.000 8.439 10.000 25.000
table(dataset[[49]])
##
## 0 1 2 3 4 5 6 7 8 9
## 309 2652 11471 32179 67995 115159 161555 194499 205217 192144
## 10 11 12 13 14 15 16 17 18 19
## 163008 124002 87453 57088 34429 19481 10180 5135 2348 1011
## 20 21 22 23 24 25
## 440 182 68 16 4 3
table(dataset[[49]]) / nrow(dataset)
##
## 0 1 2 3 4
## 2.076574e-04 1.782225e-03 7.708860e-03 2.162527e-02 4.569470e-02
## 5 6 7 8 9
## 7.739034e-02 1.085699e-01 1.307092e-01 1.379121e-01 1.291266e-01
## 10 11 12 13 14
## 1.095463e-01 8.333311e-02 5.877107e-02 3.836487e-02 2.313733e-02
## 15 16 17 18 19
## 1.309182e-02 6.841269e-03 3.450876e-03 1.577927e-03 6.794227e-04
## 20 21 22 23 24
## 2.956934e-04 1.223095e-04 4.569806e-05 1.075249e-05 2.688121e-06
## 25
## 2.016091e-06
miss_ps_calc_10 <- sum(is.na(dataset[[49]]))
miss_ps_calc_10
## [1] 0

# ---- ps_calc_11 (column 50) ----
summary(dataset[[50]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 4.00 5.00 5.44 7.00 20.00
table(dataset[[50]])
##
## 0 1 2 3 4 5 6 7 8 9
## 6329 35157 95733 173074 235407 256571 232989 180725 122227 74547
## 10 11 12 13 14 15 16 17 18 19
## 40262 19962 8978 3801 1471 524 190 51 23 6
## 20
## 1
table(dataset[[50]]) / nrow(dataset)
##
## 0 1 2 3 4
## 4.253280e-03 2.362657e-02 6.433548e-02 1.163110e-01 1.582007e-01
## 5 6 7 8 9
## 1.724235e-01 1.565757e-01 1.214527e-01 8.214026e-02 5.009785e-02
## 10 11 12 13 14
## 2.705729e-02 1.341507e-02 6.033489e-03 2.554387e-03 9.885567e-04
## 15 16 17 18 19
## 3.521439e-04 1.276858e-04 3.427355e-05 1.545670e-05 4.032182e-06
## 20
## 6.720304e-07
miss_ps_calc_11 <- sum(is.na(dataset[[50]]))
miss_ps_calc_11
## [1] 0

# ---- ps_calc_12 (column 51) ----
summary(dataset[[51]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 1.000 1.000 1.441 2.000 11.000
table(dataset[[51]])
##
## 0 1 2 3 4 5 6 7 8 9
## 352711 507536 364715 175675 63648 18241 4383 937 149 29
## 10 11
## 3 1
table(dataset[[51]]) / nrow(dataset)
##
## 0 1 2 3 4
## 2.370325e-01 3.410796e-01 2.450996e-01 1.180589e-01 4.277339e-02
## 5 6 7 8 9
## 1.225851e-02 2.945509e-03 6.296925e-04 1.001325e-04 1.948888e-05
## 10 11
## 2.016091e-06 6.720304e-07
miss_ps_calc_12 <- sum(is.na(dataset[[51]]))
miss_ps_calc_12
## [1] 0

# ---- ps_calc_13 (column 52) ----
summary(dataset[[52]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 2.000 3.000 2.874 4.000 15.000
table(dataset[[52]])
##
## 0 1 2 3 4 5 6 7 8 9
## 84026 241100 347605 332214 239373 136916 65687 27229 9685 3032
## 10 11 12 13 14 15
## 889 209 51 9 2 1
table(dataset[[52]]) / nrow(dataset)
##
## 0 1 2 3 4
## 5.646802e-02 1.620265e-01 2.336011e-01 2.232579e-01 1.608659e-01
## 5 6 7 8 9
## 9.201171e-02 4.414366e-02 1.829871e-02 6.508614e-03 2.037596e-03
## 10 11 12 13 14
## 5.974350e-04 1.404543e-04 3.427355e-05 6.048273e-06 1.344061e-06
## 15
## 6.720304e-07
miss_ps_calc_13 <- sum(is.na(dataset[[52]]))
miss_ps_calc_13
## [1] 0

# ---- ps_calc_14 (column 53) ----
summary(dataset[[53]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 6.00 7.00 7.54 9.00 28.00
table(dataset[[53]])
##
## 0 1 2 3 4 5 6 7 8 9
## 776 5875 22460 56722 106204 160704 201962 217553 204937 171448
## 10 11 12 13 14 15 16 17 18 19
## 129415 88435 55895 32216 17461 8674 4086 1913 778 326
## 20 21 22 23 28
## 114 49 20 4 1
table(dataset[[53]]) / nrow(dataset)
##
## 0 1 2 3 4
## 5.214956e-04 3.948178e-03 1.509380e-02 3.811891e-02 7.137231e-02
## 5 6 7 8 9
## 1.079980e-01 1.357246e-01 1.462022e-01 1.377239e-01 1.152183e-01
## 10 11 12 13 14
## 8.697081e-02 5.943101e-02 3.756314e-02 2.165013e-02 1.173432e-02
## 15 16 17 18 19
## 5.829191e-03 2.745916e-03 1.285594e-03 5.228396e-04 2.190819e-04
## 20 21 22 23 28
## 7.661146e-05 3.292949e-05 1.344061e-05 2.688121e-06 6.720304e-07
miss_ps_calc_14 <- sum(is.na(dataset[[53]]))
miss_ps_calc_14
## [1] 0
# ---- ps_reg_01 (column 21) ----
summary(dataset[[21]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.400 0.700 0.611 0.900 0.900
miss_ps_reg_01 <- sum(is.na(dataset[[21]]))
miss_ps_reg_01
## [1] 0
sd(dataset[[21]])
## [1] 0.2876763
var(dataset[[21]])
## [1] 0.08275764
# Skewness: mean cubed deviation from the mean, scaled by sd^3.
skew_ps_reg_01 <- mean(
  (dataset[[21]] - mean(dataset[[21]]))^3 / sd(dataset[[21]])^3
)
skew_ps_reg_01
## [1] -0.6404163
# Excess kurtosis: mean fourth-power deviation scaled by sd^4, minus 3.
kurtosys_ps_reg_01 <- mean(
  (dataset[[21]] - mean(dataset[[21]]))^4 / sd(dataset[[21]])^4
) - 3
kurtosys_ps_reg_01
## [1] -0.8847695
# Points beyond the boxplot whiskers (none for this feature).
boxplot.stats(dataset[[21]])$out
## numeric(0)
# ---- ps_reg_02 (column 22) ----
summary(dataset[[22]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.2000 0.3000 0.4396 0.6000 1.8000
miss_ps_reg_02 <- sum(is.na(dataset[[22]]))
miss_ps_reg_02
## [1] 0
sd(dataset[[22]])
## [1] 0.4045123
var(dataset[[22]])
## [1] 0.1636302
# Skewness: mean cubed deviation from the mean, scaled by sd^3.
skew_ps_reg_02 <- mean(
  (dataset[[22]] - mean(dataset[[22]]))^3 / sd(dataset[[22]])^3
)
skew_ps_reg_02
## [1] 1.280006
# Excess kurtosis: mean fourth-power deviation scaled by sd^4, minus 3.
kurtosys_ps_reg_02 <- mean(
  (dataset[[22]] - mean(dataset[[22]]))^4 / sd(dataset[[22]])^4
) - 3
kurtosys_ps_reg_02
## [1] 1.118253
# Range and count of the points beyond the boxplot whiskers.
max(boxplot.stats(dataset[[22]])$out)
## [1] 1.8
min(boxplot.stats(dataset[[22]])$out)
## [1] 1.3
sum(table(boxplot.stats(dataset[[22]])$out))
## [1] 92219
# Quartiles, 5%/95% quantiles and the 1.5 * IQR fence width.
x <- dataset[[22]]
qnt <- quantile(x, probs = c(0.25, 0.75), na.rm = TRUE)
qnt
## 25% 75%
## 0.2 0.6
caps <- quantile(x, probs = c(0.05, 0.95), na.rm = TRUE)
caps
## 5% 95%
## 0.0 1.3
H <- 1.5 * IQR(x, na.rm = TRUE)
H
## [1] 0.6
# Values below Q1 - H (out1) and above Q3 + H (out2).
out1 <- dataset$ps_reg_02[dataset$ps_reg_02 < qnt[1] - H]
out2 <- dataset$ps_reg_02[dataset$ps_reg_02 > qnt[2] + H]
sum(!is.na(out1))
## [1] 0
sum(!is.na(out2))
## [1] 92219
sum(!is.na(out2)) / nrow(dataset)
## [1] 0.06197397
# ---- ps_reg_03 (column 23) ----
summary(dataset[[23]])
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.06 0.63 0.80 0.89 1.09 4.42 269456
miss_ps_reg_03 <- sum(is.na(dataset[[23]]))
miss_ps_reg_03
## [1] 269456
miss_ps_reg_03 / nrow(dataset)
## [1] 0.1810826
# This column has missing values, so the moments below are computed on the
# observed values only (na.rm = TRUE). Without it sd(), var(), the skewness
# and the kurtosis all evaluated to NA. The recorded NA outputs were removed;
# re-run the script to capture the actual values.
sd(dataset[[23]], na.rm = TRUE)
var(dataset[[23]], na.rm = TRUE)
# Skewness: mean cubed deviation from the mean, scaled by sd^3.
skew_ps_reg_03 <- mean(
  (dataset[[23]] - mean(dataset[[23]], na.rm = TRUE))^3 /
    sd(dataset[[23]], na.rm = TRUE)^3,
  na.rm = TRUE
)
skew_ps_reg_03
# Excess kurtosis: mean fourth-power deviation scaled by sd^4, minus 3.
kurtosys_ps_reg_03 <- mean(
  (dataset[[23]] - mean(dataset[[23]], na.rm = TRUE))^4 /
    sd(dataset[[23]], na.rm = TRUE)^4,
  na.rm = TRUE
) - 3
kurtosys_ps_reg_03
# Range and count of the points beyond the boxplot whiskers.
max(boxplot.stats(dataset[[23]])$out)
## [1] 4.423517
min(boxplot.stats(dataset[[23]])$out)
## [1] 1.763874
sum(table(boxplot.stats(dataset[[23]])$out))
## [1] 26192
# Quartiles, 5%/95% quantiles and the 1.5 * IQR fence width.
x <- dataset[[23]]
qnt <- quantile(x, probs = c(0.25, 0.75), na.rm = TRUE)
qnt
## 25% 75%
## 0.6339361 1.0854147
caps <- quantile(x, probs = c(0.05, 0.95), na.rm = TRUE)
caps
## 5% 95%
## 0.4911721 1.5858752
H <- 1.5 * IQR(x, na.rm = TRUE)
H
## [1] 0.6772178
# Values below Q1 - H (out1) and above Q3 + H (out2). Missing inputs yield NA
# entries in the subsets, hence the !is.na() when counting.
out1 <- dataset$ps_reg_03[dataset$ps_reg_03 < qnt[1] - H]
out2 <- dataset$ps_reg_03[dataset$ps_reg_03 > qnt[2] + H]
sum(!is.na(out1))
## [1] 0
sum(!is.na(out2))
## [1] 26192
sum(!is.na(out2)) / nrow(dataset)
## [1] 0.01760182
# ---- ps_car_12 (column 36) ----
summary(dataset[[36]])
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.1000 0.3162 0.3742 0.3800 0.4000 1.2649 1
miss_ps_car_12 <- sum(is.na(dataset[[36]]))
miss_ps_car_12
## [1] 1
miss_ps_car_12 / nrow(dataset)
## [1] 6.720304e-07
# A single missing value is enough to turn sd(), var(), the skewness and the
# kurtosis into NA, so they are computed with na.rm = TRUE. The recorded NA
# outputs were removed; re-run the script to capture the actual values.
sd(dataset[[36]], na.rm = TRUE)
var(dataset[[36]], na.rm = TRUE)
# Skewness: mean cubed deviation from the mean, scaled by sd^3.
skew_ps_car_12 <- mean(
  (dataset[[36]] - mean(dataset[[36]], na.rm = TRUE))^3 /
    sd(dataset[[36]], na.rm = TRUE)^3,
  na.rm = TRUE
)
skew_ps_car_12
# Excess kurtosis: mean fourth-power deviation scaled by sd^4, minus 3.
kurtosys_ps_car_12 <- mean(
  (dataset[[36]] - mean(dataset[[36]], na.rm = TRUE))^4 /
    sd(dataset[[36]], na.rm = TRUE)^4,
  na.rm = TRUE
) - 3
kurtosys_ps_car_12
# Range and count of the points beyond the boxplot whiskers.
max(boxplot.stats(dataset[[36]])$out)
## [1] 1.264911
min(boxplot.stats(dataset[[36]])$out)
## [1] 0.1
sum(table(boxplot.stats(dataset[[36]])$out))
## [1] 38618
# Quartiles, 5%/95% quantiles and the 1.5 * IQR fence width.
x <- dataset[[36]]
qnt <- quantile(x, probs = c(0.25, 0.75), na.rm = TRUE)
qnt
## 25% 75%
## 0.3162278 0.4000000
caps <- quantile(x, probs = c(0.05, 0.95), na.rm = TRUE)
caps
## 5% 95%
## 0.3160696 0.4688283
H <- 1.5 * IQR(x, na.rm = TRUE)
H
## [1] 0.1256584
# Values below Q1 - H (out1) and above Q3 + H (out2). Missing inputs yield NA
# entries in the subsets, hence the !is.na() when counting.
out1 <- dataset$ps_car_12[dataset$ps_car_12 < qnt[1] - H]
out2 <- dataset$ps_car_12[dataset$ps_car_12 > qnt[2] + H]
sum(!is.na(out1))
## [1] 140
sum(!is.na(out1)) / nrow(dataset)
## [1] 9.408425e-05
sum(!is.na(out2))
## [1] 38478
sum(!is.na(out2)) / nrow(dataset)
## [1] 0.02585838
# ---- ps_car_13 (column 37) ----
summary(dataset[[37]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.2506 0.6710 0.7660 0.8135 0.9061 4.0313
miss_ps_car_13 <- sum(is.na(dataset[[37]]))
miss_ps_car_13
## [1] 0
sd(dataset[[37]])
## [1] 0.2247024
var(dataset[[37]])
## [1] 0.05049117
# Skewness: mean cubed deviation from the mean, scaled by sd^3.
skew_ps_car_13 <- mean(
  (dataset[[37]] - mean(dataset[[37]]))^3 / sd(dataset[[37]])^3
)
skew_ps_car_13
## [1] 1.697218
# Excess kurtosis: mean fourth-power deviation scaled by sd^4, minus 3.
kurtosys_ps_car_13 <- mean(
  (dataset[[37]] - mean(dataset[[37]]))^4 / sd(dataset[[37]])^4
) - 3
kurtosys_ps_car_13
## [1] 5.392546
# Range and count of the points beyond the boxplot whiskers.
max(boxplot.stats(dataset[[37]])$out)
## [1] 4.031301
min(boxplot.stats(dataset[[37]])$out)
## [1] 0.2506191
sum(table(boxplot.stats(dataset[[37]])$out))
## [1] 67765
# Quartiles, 5%/95% quantiles and the 1.5 * IQR fence width.
x <- dataset[[37]]
qnt <- quantile(x, probs = c(0.25, 0.75), na.rm = TRUE)
qnt
## 25% 75%
## 0.6710052 0.9061429
caps <- quantile(x, probs = c(0.05, 0.95), na.rm = TRUE)
caps
## 5% 95%
## 0.5426823 1.2368899
H <- 1.5 * IQR(x, na.rm = TRUE)
H
## [1] 0.3527067
# Values below Q1 - H (out1) and above Q3 + H (out2).
out1 <- dataset$ps_car_13[dataset$ps_car_13 < qnt[1] - H]
out2 <- dataset$ps_car_13[dataset$ps_car_13 > qnt[2] + H]
sum(!is.na(out1))
## [1] 36
sum(!is.na(out1)) / nrow(dataset)
## [1] 2.419309e-05
sum(!is.na(out2))
## [1] 67729
sum(!is.na(out2)) / nrow(dataset)
## [1] 0.04551594
# ---- ps_car_14 (column 38) ----
summary(dataset[[38]])
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.11 0.35 0.37 0.37 0.40 0.64 106425
miss_ps_car_14 <- sum(is.na(dataset[[38]]))
miss_ps_car_14
## [1] 106425
miss_ps_car_14 / nrow(dataset)
## [1] 0.07152083
# This column has missing values, so the moments below are computed on the
# observed values only (na.rm = TRUE). Without it sd(), var(), the skewness
# and the kurtosis all evaluated to NA. The recorded NA outputs were removed;
# re-run the script to capture the actual values.
sd(dataset[[38]], na.rm = TRUE)
var(dataset[[38]], na.rm = TRUE)
# Skewness: mean cubed deviation from the mean, scaled by sd^3.
skew_ps_car_14 <- mean(
  (dataset[[38]] - mean(dataset[[38]], na.rm = TRUE))^3 /
    sd(dataset[[38]], na.rm = TRUE)^3,
  na.rm = TRUE
)
skew_ps_car_14
# Excess kurtosis: mean fourth-power deviation scaled by sd^4, minus 3.
kurtosys_ps_car_14 <- mean(
  (dataset[[38]] - mean(dataset[[38]], na.rm = TRUE))^4 /
    sd(dataset[[38]], na.rm = TRUE)^4,
  na.rm = TRUE
) - 3
kurtosys_ps_car_14
# Range and count of the points beyond the boxplot whiskers.
max(boxplot.stats(dataset[[38]])$out)
## [1] 0.6363961
min(boxplot.stats(dataset[[38]])$out)
## [1] 0.1095445
sum(table(boxplot.stats(dataset[[38]])$out))
## [1] 46945
# Quartiles, 5%/95% quantiles and the 1.5 * IQR fence width.
x <- dataset[[38]]
qnt <- quantile(x, probs = c(0.25, 0.75), na.rm = TRUE)
qnt
## 25% 75%
## 0.3504283 0.3977436
caps <- quantile(x, probs = c(0.05, 0.95), na.rm = TRUE)
caps
## 5% 95%
## 0.3016621 0.4440721
H <- 1.5 * IQR(x, na.rm = TRUE)
H
## [1] 0.07097299
# Values below Q1 - H (out1) and above Q3 + H (out2). Missing inputs yield NA
# entries in the subsets, hence the !is.na() when counting.
out1 <- dataset$ps_car_14[dataset$ps_car_14 < qnt[1] - H]
out2 <- dataset$ps_car_14[dataset$ps_car_14 > qnt[2] + H]
sum(!is.na(out1))
## [1] 2302
sum(!is.na(out1)) / nrow(dataset)
## [1] 0.001547014
sum(!is.na(out2))
## [1] 44643
sum(!is.na(out2)) / nrow(dataset)
## [1] 0.03000145
# ---- ps_car_15 (column 39) ----
summary(dataset[[39]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 2.828 3.317 3.067 3.606 3.742
miss_ps_car_15 <- sum(is.na(dataset[[39]]))
miss_ps_car_15
## [1] 0
sd(dataset[[39]])
## [1] 0.729951
var(dataset[[39]])
## [1] 0.5328285
# Skewness: mean cubed deviation from the mean, scaled by sd^3.
skew_ps_car_15 <- mean(
  (dataset[[39]] - mean(dataset[[39]]))^3 / sd(dataset[[39]])^3
)
skew_ps_car_15
## [1] -2.220883
# Excess kurtosis: mean fourth-power deviation scaled by sd^4, minus 3.
kurtosys_ps_car_15 <- mean(
  (dataset[[39]] - mean(dataset[[39]]))^4 / sd(dataset[[39]])^4
) - 3
kurtosys_ps_car_15
## [1] 5.908309
# Range and count of the points beyond the boxplot whiskers.
max(boxplot.stats(dataset[[39]])$out)
## [1] 1.414214
min(boxplot.stats(dataset[[39]])$out)
## [1] 0
sum(table(boxplot.stats(dataset[[39]])$out))
## [1] 68172
# Quartiles, 5%/95% quantiles and the 1.5 * IQR fence width.
x <- dataset[[39]]
qnt <- quantile(x, probs = c(0.25, 0.75), na.rm = TRUE)
qnt
## 25% 75%
## 2.828427 3.605551
caps <- quantile(x, probs = c(0.05, 0.95), na.rm = TRUE)
caps
## 5% 95%
## 1.732051 3.741657
H <- 1.5 * IQR(x, na.rm = TRUE)
H
## [1] 1.165686
# Values below Q1 - H (out1) and above Q3 + H (out2).
out1 <- dataset$ps_car_15[dataset$ps_car_15 < qnt[1] - H]
out2 <- dataset$ps_car_15[dataset$ps_car_15 > qnt[2] + H]
sum(!is.na(out1))
## [1] 68172
sum(!is.na(out1)) / nrow(dataset)
## [1] 0.04581365
sum(!is.na(out2))
## [1] 0
summary(dataset[,40])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.2000 0.4000 0.4497 0.7000 0.9000
miss_ps_calc_01 <- sum(is.na(dataset[,40]) == TRUE)
miss_ps_calc_01
## [1] 0
sd(dataset[,40])
## [1] 0.2872071
var(dataset[,40])
## [1] 0.0824879
skew_ps_calc_01 <- mean((dataset[,40] - mean(dataset[,40]))^3/sd(dataset[,40])^3)
skew_ps_calc_01
## [1] 0.0007329376
kurtosys_ps_calc_01 <- mean((dataset[,40] - mean(dataset[,40]))^4/sd(dataset[,40])^4) - 3
kurtosys_ps_calc_01
## [1] -1.22389
boxplot.stats(dataset[,40])$out
## numeric(0)
# ps_calc_02 (column 41 of `dataset`): five-number summary, missing-value
# count, dispersion, shape statistics and boxplot outliers.
summary(dataset[, 41])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.2000 0.5000 0.4501 0.7000 0.9000
miss_ps_calc_02 <- sum(is.na(dataset[, 41]))
miss_ps_calc_02
## [1] 0
sd(dataset[, 41])
## [1] 0.2871817
var(dataset[, 41])
## [1] 0.08247332
# Skewness: mean cubed deviation divided by the cubed standard deviation.
skew_ps_calc_02 <- local({
  v <- dataset[, 41]
  mean((v - mean(v))^3 / sd(v)^3)
})
skew_ps_calc_02
## [1] -6.740767e-05
# Excess kurtosis: fourth-power analogue, minus 3.
kurtosys_ps_calc_02 <- local({
  v <- dataset[, 41]
  mean((v - mean(v))^4 / sd(v)^4) - 3
})
kurtosys_ps_calc_02
## [1] -1.223397
# Points flagged by boxplot.stats(); none were reported in the recorded run.
boxplot.stats(dataset[, 41])[["out"]]
## numeric(0)
# ps_calc_03 (column 42 of `dataset`): five-number summary, missing-value
# count, dispersion, shape statistics and boxplot outliers.
summary(dataset[, 42])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 0.20 0.50 0.45 0.70 0.90
miss_ps_calc_03 <- sum(is.na(dataset[, 42]))
miss_ps_calc_03
## [1] 0
sd(dataset[, 42])
## [1] 0.2872136
var(dataset[, 42])
## [1] 0.08249165
# Skewness: mean cubed deviation divided by the cubed standard deviation.
skew_ps_calc_03 <- local({
  v <- dataset[, 42]
  mean((v - mean(v))^3 / sd(v)^3)
})
skew_ps_calc_03
## [1] 0.0002636599
# Excess kurtosis: fourth-power analogue, minus 3.
kurtosys_ps_calc_03 <- local({
  v <- dataset[, 42]
  mean((v - mean(v))^4 / sd(v)^4) - 3
})
kurtosys_ps_calc_03
## [1] -1.224441
# Points flagged by boxplot.stats(); none were reported in the recorded run.
boxplot.stats(dataset[, 42])[["out"]]
## numeric(0)
# Discrete view of the data: every column of `dataset` except column 1 and
# the continuous columns 21-23 and 36-42, each converted to a factor.
# NOTE(review): the name `cat` masks base::cat() as a variable; calls such as
# cat("...") still resolve to the function, but a rename would be clearer.
cat <- dataset[, -c(1, 21:23, 36:42)]
# Convert whatever columns are present instead of a hard-coded 1:48 range, so
# the conversion keeps working if the column set changes. `cat[] <-` keeps
# the data-frame structure while replacing every column.
cat[] <- lapply(cat, as.factor)
# First column of `cat` (the target, per the "Response: y" model output
# further down) is the response for the association tests.
y <- cat[, 1]
# Chi-squared test of independence between the response `y` and every other
# column of `cat` (columns 2..ncol(cat)), one test per column.
#
# The tests used to be 47 copy-pasted calls. Five of them (cat columns 36, 39,
# 40, 41 and 42) returned X-squared = NaN: a factor level with no rows that
# have a known `y` yields an all-zero column in the contingency table, so its
# expected counts are 0 and the statistic becomes 0/0. Those empty columns are
# dropped before testing; tables without empty columns are tested unchanged.
#
# Results are printed as before and also kept in `chisq_results`, a list
# named after the columns of `cat` (the first slot, for `y` itself, is NULL).
chisq_results <- vector("list", ncol(cat))
names(chisq_results) <- names(cat)
for (i in seq_len(ncol(cat))[-1]) {
  tab <- table(y, cat[, i])
  # Keep only levels observed together with a known response.
  tab <- tab[, colSums(tab) > 0, drop = FALSE]
  writeLines(sprintf("cat[, %d] (%s)", i, names(cat)[i]))
  chisq_results[[i]] <- chisq.test(tab)
  print(chisq_results[[i]])
}
# Results recorded from the original run, before empty levels were dropped
# ("Y" = Yates' continuity correction was applied, i.e. a 2 x 2 table).
# The NaN rows are the ones the empty-level fix targets; re-run to get values.
## column X-squared df p-value
## cat[, 2] 255.05 7 < 2.2e-16
## cat[, 3] 23.04 3 3.962e-05
## cat[, 4] 660.12 11 < 2.2e-16
## cat[, 5] 61.149 1 5.291e-15 Y
## cat[, 6] 771.16 6 < 2.2e-16
## cat[, 7] 688.38 1 < 2.2e-16 Y
## cat[, 8] 696.5 1 < 2.2e-16 Y
## cat[, 9] 102.69 1 < 2.2e-16 Y
## cat[, 10] 40.269 1 2.213e-10 Y
## cat[, 11] 1.4908 1 0.2221 Y
## cat[, 12] 2.1921 1 0.1387 Y
## cat[, 13] 35.879 1 2.1e-09 Y
## cat[, 14] 3.1888 1 0.07415 Y
## cat[, 15] 38.016 4 1.112e-07
## cat[, 16] 334.55 13 < 2.2e-16
## cat[, 17] 458.97 1 < 2.2e-16 Y
## cat[, 18] 816.56 1 < 2.2e-16 Y
## cat[, 19] 12.285 1 0.0004567 Y
## cat[, 20] 770.78 11 < 2.2e-16
## cat[, 21] 591.82 1 < 2.2e-16 Y
## cat[, 22] 78.848 1 < 2.2e-16 Y
## cat[, 23] 866.46 9 < 2.2e-16
## cat[, 24] 0.24066 1 0.6237 Y
## cat[, 25] 831.49 17 < 2.2e-16
## cat[, 26] 226.87 1 < 2.2e-16 Y
## cat[, 27] 246.01 1 < 2.2e-16 Y
## cat[, 28] 420.39 4 < 2.2e-16
## cat[, 29] 0.64897 2 0.7229
## cat[, 30] 1490.1 103 < 2.2e-16
## cat[, 31] 255.65 3 < 2.2e-16
## cat[, 32] 3.0938 5 0.6855
## cat[, 33] 6.9657 6 0.324
## cat[, 34] 7.3787 10 0.6893
## cat[, 35] 11.443 9 0.2466
## cat[, 36] NaN 11 NA
## cat[, 37] 6.1507 7 0.5223
## cat[, 38] 24.017 25 0.5184
## cat[, 39] NaN 20 NA
## cat[, 40] NaN 11 NA
## cat[, 41] NaN 15 NA
## cat[, 42] NaN 24 NA
## cat[, 43] 0.13529 1 0.713 Y
## cat[, 44] 0.2248 1 0.6354 Y
## cat[, 45] 0.01545 1 0.9011 Y
## cat[, 46] 0.17519 1 0.6755 Y
## cat[, 47] 1.7905 1 0.1809 Y
## cat[, 48] 0.66851 1 0.4136 Y
# Continuous predictors: columns 21-23 and 36-42 of `dataset`.
num <- dataset[c(21:23, 36:42)]
# Spearman rank correlations between them, on complete rows only.
cornum <- cor(num, method = "spearman", use = "complete.obs")
summary(cornum[upper.tri(cornum)])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.0520355 0.0002341 0.0011868 0.1075220 0.1059585 0.6844952
corrplot(cornum, method = "number", type = "lower", diag = FALSE, tl.col = "black")
# Logistic regression of the response `y` on all continuous predictors; rows
# with missing values are omitted.
fit1 <- glm(
  y ~ .,
  family = binomial(link = "logit"),
  data = num,
  na.action = na.omit
)
# Standard glm diagnostic plots on a 2 x 2 grid.
par(mfrow = c(2, 2))
plot(fit1)
# Type III Wald tests for each term of the fit.
Anova(fit1, type = 3, test.statistic = "Wald", singular.ok = TRUE)
## Analysis of Deviance Table (Type III tests)
##
## Response: y
## Df Chisq Pr(>Chisq)
## (Intercept) 1 1868.2474 < 2.2e-16 ***
## ps_reg_01 1 1.5207 0.2175
## ps_reg_02 1 21.2829 3.963e-06 ***
## ps_reg_03 1 21.5484 3.450e-06 ***
## ps_car_12 1 65.4737 5.890e-16 ***
## ps_car_13 1 186.6955 < 2.2e-16 ***
## ps_car_14 1 116.0678 < 2.2e-16 ***
## ps_car_15 1 41.7664 1.029e-10 ***
## ps_calc_01 1 0.7684 0.3807
## ps_calc_02 1 0.0299 0.8626
## ps_calc_03 1 1.7033 0.1919
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Replace missing values with one fixed constant per column.
# NOTE(review): the constants look like per-column modes/means chosen from
# the earlier summaries - confirm against that analysis.
na_fill <- c(
  ps_ind_02_cat = 1,
  ps_ind_04_cat = 0,
  ps_ind_05_cat = 0,
  ps_car_01_cat = 11,
  ps_car_02_cat = 1,
  ps_car_07_cat = 1,
  ps_car_09_cat = 2,
  ps_car_11 = 2,
  ps_car_12 = 0.3742,
  ps_car_14 = 0.37
)
for (col_name in names(na_fill)) {
  missing_rows <- is.na(dataset[[col_name]])
  dataset[[col_name]][missing_rows] <- na_fill[[col_name]]
}
# Impute the missing ps_reg_03 values with caret's bagged-tree imputation
# ("bagImpute"), using the target plus continuous columns 21-23 and 36-42.
train1 <- dataset[, c(2, 21:23, 36:42)]
train1 <- as.data.frame(train1)
# Build the predictor matrix from `train1`. The script previously referred to
# `train2`, which is never created here, while `train1` was left unused.
# NOTE(review): if `train2` was a different column subset in the original
# session, the imputed values will differ slightly - confirm the intended set.
dummies <- dummyVars(target ~ ., data = train1)
train.dummy <- predict(dummies, newdata = train1)
# Fixed seed so the bagged trees are reproducible.
set.seed(2000)
pre.process <- preProcess(train.dummy, method = "bagImpute")
imputed.data <- predict(pre.process, train.dummy)
# Pick the imputed column by name; a positional index silently selects the
# wrong predictor whenever the column order of the input changes.
dataset$ps_reg_03 <- imputed.data[, "ps_reg_03"]
summary(dataset[, 23])
# Output recorded from the original run (re-run to refresh after changes):
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.06124 0.59634 0.75746 0.85617 1.00964 4.42352
# Cast columns 4, 6, 7, 24, 25, 30, 32 and 35 of `dataset` to integer.
int_cols <- c(4, 6, 7, 24, 25, 30, 32, 35)
dataset[int_cols] <- lapply(dataset[int_cols], as.integer)
# Drop columns 1, 26 and 28, then inspect pairwise Spearman correlations of
# everything that remains (complete rows only).
dataset <- dataset[-c(1, 26, 28)]
# NOTE(review): the result is stored under the name `cor`, which masks
# stats::cor() as a variable; later cor(...) calls still find the function.
cor <- cor(dataset, method = "spearman", use = "complete.obs")
summary(cor[upper.tri(cor)])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.5996363 -0.0013436 0.0001581 0.0046242 0.0020814 0.9378869
# Number of column pairs whose absolute correlation exceeds 0.75.
highcor <- sum(abs(cor[upper.tri(cor)]) > 0.75)
highcor
## [1] 1
corrplot(cor, method = "number", type = "lower", diag = FALSE, tl.col = "black")
# Remove column 15 of the reduced frame.
# NOTE(review): presumably one member of the highly correlated pair - confirm.
dataset <- dataset[-15]
# Near-zero-variance screening with caret::nearZeroVar().
# First look at the metrics of the flagged columns. head() shows at most 15
# rows; indexing with [1:15, ] padded the 7 flagged rows with all-NA rows.
nzv <- nearZeroVar(dataset, saveMetrics = TRUE)
head(nzv[nzv$nzv, ], 15)
## freqRatio percentUnique zeroVar nzv
## target 26.43671 0.0001344061 FALSE TRUE
## ps_ind_05_cat 25.71334 0.0004704213 FALSE TRUE
## ps_ind_10_bin 2680.13153 0.0001344061 FALSE TRUE
## ps_ind_11_bin 611.10531 0.0001344061 FALSE TRUE
## ps_ind_12_bin 105.37129 0.0001344061 FALSE TRUE
## ps_ind_13_bin 996.33780 0.0001344061 FALSE TRUE
## ps_car_10_cat 121.57713 0.0002016091 FALSE TRUE
# Positions of the flagged columns.
nzv <- nearZeroVar(dataset)
# Drop every flagged column except the target. The target is excluded by name
# rather than by assuming it is the first flagged entry, and the guard avoids
# `dataset[, -integer(0)]`, which would select zero columns.
nzv_drop <- setdiff(nzv, match("target", names(dataset)))
if (length(nzv_drop) > 0) {
  dataset <- dataset[, -nzv_drop, drop = FALSE]
}
dim(dataset)
## [1] 1488028 49
str(dataset)
## 'data.frame': 1488028 obs. of 49 variables:
## $ target : int 0 0 0 0 0 0 0 0 0 1 ...
## $ ps_ind_01 : int 2 1 5 0 0 5 2 5 5 1 ...
## $ ps_ind_02_cat : int 2 1 4 1 2 1 1 1 1 1 ...
## $ ps_ind_03 : int 5 7 9 2 0 4 3 4 3 2 ...
## $ ps_ind_04_cat : int 1 0 1 0 1 0 1 0 1 0 ...
## $ ps_ind_06_bin : int 0 0 0 1 1 0 0 1 0 0 ...
## $ ps_ind_07_bin : int 1 0 0 0 0 0 1 0 0 1 ...
## $ ps_ind_08_bin : int 0 1 1 0 0 0 0 0 1 0 ...
## $ ps_ind_09_bin : int 0 0 0 0 0 1 0 0 0 0 ...
## $ ps_ind_15 : int 11 3 12 8 9 6 8 13 6 4 ...
## $ ps_ind_16_bin : int 0 0 1 1 1 1 1 1 1 0 ...
## $ ps_ind_17_bin : int 1 0 0 0 0 0 0 0 0 0 ...
## $ ps_ind_18_bin : int 0 1 0 0 0 0 0 0 0 1 ...
## $ ps_reg_01 : num 0.7 0.8 0 0.9 0.7 0.9 0.6 0.7 0.9 0.9 ...
## $ ps_reg_02 : num 0.2 0.4 0 0.2 0.6 1.8 0.1 0.4 0.7 1.4 ...
## $ ps_reg_03 : num 0.718 0.766 0.775 0.581 0.841 ...
## $ ps_car_01_cat : int 10 11 7 7 11 10 6 11 10 11 ...
## $ ps_car_02_cat : int 1 1 1 1 1 0 1 1 1 0 ...
## $ ps_car_04_cat : int 0 0 0 0 0 0 0 0 0 1 ...
## $ ps_car_06_cat : int 4 11 14 11 14 14 11 11 14 14 ...
## $ ps_car_07_cat : int 1 1 1 1 1 1 1 1 1 1 ...
## $ ps_car_08_cat : int 0 1 1 1 1 1 1 1 1 1 ...
## $ ps_car_09_cat : int 0 2 2 3 2 0 0 2 0 2 ...
## $ ps_car_11_cat : int 12 19 60 104 82 104 99 30 68 104 ...
## $ ps_car_11 : int 2 3 1 1 3 2 2 3 3 2 ...
## $ ps_car_12 : num 0.4 0.316 0.316 0.374 0.316 ...
## $ ps_car_13 : num 0.884 0.619 0.642 0.543 0.566 ...
## $ ps_car_14 : num 0.371 0.389 0.347 0.295 0.365 ...
## $ ps_car_15 : num 3.61 2.45 3.32 2 2 ...
## $ ps_calc_01 : num 0.6 0.3 0.5 0.6 0.4 0.7 0.2 0.1 0.9 0.7 ...
## $ ps_calc_02 : num 0.5 0.1 0.7 0.9 0.6 0.8 0.6 0.5 0.8 0.8 ...
## $ ps_calc_03 : num 0.2 0.3 0.1 0.1 0 0.4 0.5 0.1 0.6 0.8 ...
## $ ps_calc_04 : int 3 2 2 2 2 3 2 1 3 2 ...
## $ ps_calc_05 : int 1 1 2 4 2 1 2 2 1 2 ...
## $ ps_calc_06 : int 10 9 9 7 6 8 8 7 7 8 ...
## $ ps_calc_07 : int 1 5 1 1 3 2 1 1 3 2 ...
## $ ps_calc_08 : int 10 8 8 8 10 11 8 6 9 9 ...
## $ ps_calc_09 : int 1 1 2 4 2 3 3 1 4 1 ...
## $ ps_calc_10 : int 5 7 7 2 12 8 10 13 11 11 ...
## $ ps_calc_11 : int 9 3 4 2 3 4 3 7 4 3 ...
## $ ps_calc_12 : int 1 1 2 2 1 2 0 1 2 5 ...
## $ ps_calc_13 : int 5 1 7 4 1 0 0 3 1 0 ...
## $ ps_calc_14 : int 8 9 7 9 3 9 10 6 5 6 ...
## $ ps_calc_15_bin: int 0 0 0 0 0 0 0 1 0 0 ...
## $ ps_calc_16_bin: int 1 1 1 0 0 1 1 0 1 1 ...
## $ ps_calc_17_bin: int 1 1 1 0 0 0 0 1 0 0 ...
## $ ps_calc_18_bin: int 0 0 0 0 1 1 0 0 0 0 ...
## $ ps_calc_19_bin: int 0 1 1 0 1 1 1 1 0 1 ...
## $ ps_calc_20_bin: int 1 0 0 0 0 1 0 0 1 0 ...
# Training rows are the ones with a known target (the rows with a missing
# target are the test rows to predict). Selecting on that instead of the
# hard-coded range 1:595212 keeps the split right if the input files change.
training <- dataset[!is.na(dataset$target), ]
# Check the training rows for exact linear dependencies between columns.
comboInfo <- findLinearCombos(training)
comboInfo
## $linearCombos
## list()
##
## $remove
## NULL
# Persist the cleaned train + test frame; missing values are written as
# empty fields.
write.csv(dataset, "C:/Users/user/Desktop/Kaggle/Porto Seguro/dataset.csv", quote = FALSE, na = "", row.names = FALSE)