library(dplyr)
library(knitr)
library(tibble)
library(data.table)
library(corrplot)
library(VIM)
# Load the test set; the strings "-1" and "-1.0" are read as NA.
# as_tibble() replaces the deprecated as.tibble() alias.
test <- as_tibble(fread("test.csv", na.strings = c("-1", "-1.0")))
##
Read 0.0% of 892816 rows
Read 11.2% of 892816 rows
Read 22.4% of 892816 rows
Read 32.5% of 892816 rows
Read 43.7% of 892816 rows
Read 53.8% of 892816 rows
Read 63.8% of 892816 rows
Read 73.9% of 892816 rows
Read 85.1% of 892816 rows
Read 96.3% of 892816 rows
Read 892816 rows and 58 (of 58) columns from 0.160 GB file in 00:00:13
# Load the training set; the strings "-1" and "-1.0" are read as NA.
# as_tibble() replaces the deprecated as.tibble() alias.
train <- as_tibble(fread("train.csv", na.strings = c("-1", "-1.0")))
##
Read 15.1% of 595212 rows
Read 31.9% of 595212 rows
Read 48.7% of 595212 rows
Read 65.5% of 595212 rows
Read 82.3% of 595212 rows
Read 99.1% of 595212 rows
Read 595212 rows and 59 (of 59) columns from 0.108 GB file in 00:00:08
# Load the sample submission file as a tibble.
# as_tibble() replaces the deprecated as.tibble() alias.
sample_submit <- as_tibble(fread("sample_submission.csv"))
Can a machine learning model be a better predictor of annual auto insurance claims than basic statistics?
Each case is an individual driver.
**https://www.kaggle.com/c/porto-seguro-safe-driver-prediction**
This is an observational study.
The response variable is `target`, which indicates whether an insurance claim was filed (coded 0 or 1).
The explanatory variables are all variables other than the response variable. They are a combination of numerical and categorical variables.
Provide summary statistics relevant to your research question. For example, if you’re comparing means across groups provide means, SDs, sample sizes of each group. This step requires the use of R, hence a code chunk is provided below. Insert more code chunks as needed.
# Per-column summary statistics (quartiles, mean, NA counts) for the test set
test %>% summary()
## id ps_ind_01 ps_ind_02_cat ps_ind_03
## Min. : 0 Min. :0.000 Min. :1.000 Min. : 0.000
## 1st Qu.: 372022 1st Qu.:0.000 1st Qu.:1.000 1st Qu.: 2.000
## Median : 744307 Median :1.000 Median :1.000 Median : 4.000
## Mean : 744154 Mean :1.902 Mean :1.359 Mean : 4.414
## 3rd Qu.:1116309 3rd Qu.:3.000 3rd Qu.:2.000 3rd Qu.: 6.000
## Max. :1488026 Max. :7.000 Max. :4.000 Max. :11.000
## NA's :307
## ps_ind_04_cat ps_ind_05_cat ps_ind_06_bin ps_ind_07_bin
## Min. :0.0000 Min. :0.000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.0000 Median :0.000 Median :0.0000 Median :0.0000
## Mean :0.4176 Mean :0.422 Mean :0.3932 Mean :0.2572
## 3rd Qu.:1.0000 3rd Qu.:0.000 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :1.0000 Max. :6.000 Max. :1.0000 Max. :1.0000
## NA's :145 NA's :8710
## ps_ind_08_bin ps_ind_09_bin ps_ind_10_bin ps_ind_11_bin
## Min. :0.0000 Min. :0.0000 Min. :0.000000 Min. :0.000000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.000000 1st Qu.:0.000000
## Median :0.0000 Median :0.0000 Median :0.000000 Median :0.000000
## Mean :0.1637 Mean :0.1859 Mean :0.000373 Mean :0.001595
## 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:0.000000 3rd Qu.:0.000000
## Max. :1.0000 Max. :1.0000 Max. :1.000000 Max. :1.000000
##
## ps_ind_12_bin ps_ind_13_bin ps_ind_14 ps_ind_15
## Min. :0.000000 Min. :0.000000 Min. :0.00000 Min. : 0.000
## 1st Qu.:0.000000 1st Qu.:0.000000 1st Qu.:0.00000 1st Qu.: 5.000
## Median :0.000000 Median :0.000000 Median :0.00000 Median : 7.000
## Mean :0.009376 Mean :0.001039 Mean :0.01238 Mean : 7.297
## 3rd Qu.:0.000000 3rd Qu.:0.000000 3rd Qu.:0.00000 3rd Qu.:10.000
## Max. :1.000000 Max. :1.000000 Max. :4.00000 Max. :13.000
##
## ps_ind_16_bin ps_ind_17_bin ps_ind_18_bin ps_reg_01
## Min. :0.0000 Min. :0.0000 Min. :0.000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.000 1st Qu.:0.4000
## Median :1.0000 Median :0.0000 Median :0.000 Median :0.7000
## Mean :0.6606 Mean :0.1204 Mean :0.155 Mean :0.6111
## 3rd Qu.:1.0000 3rd Qu.:0.0000 3rd Qu.:0.000 3rd Qu.:0.9000
## Max. :1.0000 Max. :1.0000 Max. :1.000 Max. :0.9000
##
## ps_reg_02 ps_reg_03 ps_car_01_cat ps_car_02_cat
## Min. :0.0000 Min. :0.06 Min. : 0.000 Min. :0.00
## 1st Qu.:0.2000 1st Qu.:0.63 1st Qu.: 7.000 1st Qu.:1.00
## Median :0.3000 Median :0.80 Median : 7.000 Median :1.00
## Mean :0.4399 Mean :0.89 Mean : 8.294 Mean :0.83
## 3rd Qu.:0.6000 3rd Qu.:1.09 3rd Qu.:11.000 3rd Qu.:1.00
## Max. :1.8000 Max. :4.42 Max. :11.000 Max. :1.00
## NA's :161684 NA's :160 NA's :5
## ps_car_03_cat ps_car_04_cat ps_car_05_cat ps_car_06_cat
## Min. :0.0 Min. :0.0000 Min. :0.0 Min. : 0.000
## 1st Qu.:0.0 1st Qu.:0.0000 1st Qu.:0.0 1st Qu.: 1.000
## Median :1.0 Median :0.0000 Median :1.0 Median : 7.000
## Mean :0.6 Mean :0.7258 Mean :0.5 Mean : 6.564
## 3rd Qu.:1.0 3rd Qu.:0.0000 3rd Qu.:1.0 3rd Qu.:11.000
## Max. :1.0 Max. :9.0000 Max. :1.0 Max. :17.000
## NA's :616911 NA's :400359
## ps_car_07_cat ps_car_08_cat ps_car_09_cat ps_car_10_cat
## Min. :0.000 Min. :0.0000 Min. :0.00 Min. :0.0000
## 1st Qu.:1.000 1st Qu.:1.0000 1st Qu.:0.00 1st Qu.:1.0000
## Median :1.000 Median :1.0000 Median :2.00 Median :1.0000
## Mean :0.948 Mean :0.8323 Mean :1.33 Mean :0.9921
## 3rd Qu.:1.000 3rd Qu.:1.0000 3rd Qu.:2.00 3rd Qu.:1.0000
## Max. :1.000 Max. :1.0000 Max. :4.00 Max. :2.0000
## NA's :17331 NA's :877
## ps_car_11_cat ps_car_11 ps_car_12 ps_car_13
## Min. : 1.00 Min. :0.000 Min. :0.1414 Min. :0.2758
## 1st Qu.: 32.00 1st Qu.:2.000 1st Qu.:0.3162 1st Qu.:0.6712
## Median : 65.00 Median :3.000 Median :0.3742 Median :0.7661
## Mean : 62.28 Mean :2.347 Mean :0.3800 Mean :0.8136
## 3rd Qu.: 94.00 3rd Qu.:3.000 3rd Qu.:0.4000 3rd Qu.:0.9061
## Max. :104.00 Max. :3.000 Max. :1.2649 Max. :4.0313
## NA's :1
## ps_car_14 ps_car_15 ps_calc_01 ps_calc_02
## Min. :0.11 Min. :0.000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.35 1st Qu.:2.828 1st Qu.:0.2000 1st Qu.:0.2000
## Median :0.37 Median :3.317 Median :0.4000 Median :0.5000
## Mean :0.37 Mean :3.068 Mean :0.4496 Mean :0.4505
## 3rd Qu.:0.40 3rd Qu.:3.606 3rd Qu.:0.7000 3rd Qu.:0.7000
## Max. :0.64 Max. :3.742 Max. :0.9000 Max. :0.9000
## NA's :63805
## ps_calc_03 ps_calc_04 ps_calc_05 ps_calc_06
## Min. :0.0000 Min. :0.000 Min. :0.000 Min. : 1.000
## 1st Qu.:0.2000 1st Qu.:2.000 1st Qu.:1.000 1st Qu.: 7.000
## Median :0.4000 Median :2.000 Median :2.000 Median : 8.000
## Mean :0.4501 Mean :2.371 Mean :1.885 Mean : 7.688
## 3rd Qu.:0.7000 3rd Qu.:3.000 3rd Qu.:3.000 3rd Qu.: 9.000
## Max. :0.9000 Max. :5.000 Max. :6.000 Max. :10.000
##
## ps_calc_07 ps_calc_08 ps_calc_09 ps_calc_10
## Min. :0.00 Min. : 1.000 Min. :0.000 Min. : 0.000
## 1st Qu.:2.00 1st Qu.: 8.000 1st Qu.:1.000 1st Qu.: 6.000
## Median :3.00 Median : 9.000 Median :2.000 Median : 8.000
## Mean :3.01 Mean : 9.226 Mean :2.339 Mean : 8.443
## 3rd Qu.:4.00 3rd Qu.:10.000 3rd Qu.:3.000 3rd Qu.:10.000
## Max. :9.00 Max. :12.000 Max. :7.000 Max. :25.000
##
## ps_calc_11 ps_calc_12 ps_calc_13 ps_calc_14
## Min. : 0.000 Min. : 0.00 Min. : 0.000 Min. : 0.00
## 1st Qu.: 4.000 1st Qu.: 1.00 1st Qu.: 2.000 1st Qu.: 6.00
## Median : 5.000 Median : 1.00 Median : 3.000 Median : 7.00
## Mean : 5.438 Mean : 1.44 Mean : 2.875 Mean : 7.54
## 3rd Qu.: 7.000 3rd Qu.: 2.00 3rd Qu.: 4.000 3rd Qu.: 9.00
## Max. :20.000 Max. :11.00 Max. :15.000 Max. :28.00
##
## ps_calc_15_bin ps_calc_16_bin ps_calc_17_bin ps_calc_18_bin
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.0000 Median :1.0000 Median :1.0000 Median :0.0000
## Mean :0.1237 Mean :0.6278 Mean :0.5547 Mean :0.2878
## 3rd Qu.:0.0000 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
##
## ps_calc_19_bin ps_calc_20_bin
## Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.0000 Median :0.0000
## Mean :0.3493 Mean :0.1524
## 3rd Qu.:1.0000 3rd Qu.:0.0000
## Max. :1.0000 Max. :1.0000
##
# Per-column summary statistics (quartiles, mean, NA counts) for the training set
train %>% summary()
## id target ps_ind_01 ps_ind_02_cat
## Min. : 7 Min. :0.00000 Min. :0.0 Min. :1.00
## 1st Qu.: 371992 1st Qu.:0.00000 1st Qu.:0.0 1st Qu.:1.00
## Median : 743548 Median :0.00000 Median :1.0 Median :1.00
## Mean : 743804 Mean :0.03645 Mean :1.9 Mean :1.36
## 3rd Qu.:1115549 3rd Qu.:0.00000 3rd Qu.:3.0 3rd Qu.:2.00
## Max. :1488027 Max. :1.00000 Max. :7.0 Max. :4.00
## NA's :216
## ps_ind_03 ps_ind_04_cat ps_ind_05_cat ps_ind_06_bin
## Min. : 0.000 Min. :0.000 Min. :0.000 Min. :0.0000
## 1st Qu.: 2.000 1st Qu.:0.000 1st Qu.:0.000 1st Qu.:0.0000
## Median : 4.000 Median :0.000 Median :0.000 Median :0.0000
## Mean : 4.423 Mean :0.417 Mean :0.419 Mean :0.3937
## 3rd Qu.: 6.000 3rd Qu.:1.000 3rd Qu.:0.000 3rd Qu.:1.0000
## Max. :11.000 Max. :1.000 Max. :6.000 Max. :1.0000
## NA's :83 NA's :5809
## ps_ind_07_bin ps_ind_08_bin ps_ind_09_bin ps_ind_10_bin
## Min. :0.000 Min. :0.0000 Min. :0.0000 Min. :0.000000
## 1st Qu.:0.000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.000000
## Median :0.000 Median :0.0000 Median :0.0000 Median :0.000000
## Mean :0.257 Mean :0.1639 Mean :0.1853 Mean :0.000373
## 3rd Qu.:1.000 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:0.000000
## Max. :1.000 Max. :1.0000 Max. :1.0000 Max. :1.000000
##
## ps_ind_11_bin ps_ind_12_bin ps_ind_13_bin
## Min. :0.000000 Min. :0.000000 Min. :0.0000000
## 1st Qu.:0.000000 1st Qu.:0.000000 1st Qu.:0.0000000
## Median :0.000000 Median :0.000000 Median :0.0000000
## Mean :0.001692 Mean :0.009439 Mean :0.0009476
## 3rd Qu.:0.000000 3rd Qu.:0.000000 3rd Qu.:0.0000000
## Max. :1.000000 Max. :1.000000 Max. :1.0000000
##
## ps_ind_14 ps_ind_15 ps_ind_16_bin ps_ind_17_bin
## Min. :0.00000 Min. : 0.0 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.00000 1st Qu.: 5.0 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.00000 Median : 7.0 Median :1.0000 Median :0.0000
## Mean :0.01245 Mean : 7.3 Mean :0.6608 Mean :0.1211
## 3rd Qu.:0.00000 3rd Qu.:10.0 3rd Qu.:1.0000 3rd Qu.:0.0000
## Max. :4.00000 Max. :13.0 Max. :1.0000 Max. :1.0000
##
## ps_ind_18_bin ps_reg_01 ps_reg_02 ps_reg_03
## Min. :0.0000 Min. :0.000 Min. :0.0000 Min. :0.06
## 1st Qu.:0.0000 1st Qu.:0.400 1st Qu.:0.2000 1st Qu.:0.63
## Median :0.0000 Median :0.700 Median :0.3000 Median :0.80
## Mean :0.1534 Mean :0.611 Mean :0.4392 Mean :0.89
## 3rd Qu.:0.0000 3rd Qu.:0.900 3rd Qu.:0.6000 3rd Qu.:1.08
## Max. :1.0000 Max. :0.900 Max. :1.8000 Max. :4.04
## NA's :107772
## ps_car_01_cat ps_car_02_cat ps_car_03_cat ps_car_04_cat
## Min. : 0.000 Min. :0.0000 Min. :0.0 Min. :0.0000
## 1st Qu.: 7.000 1st Qu.:1.0000 1st Qu.:0.0 1st Qu.:0.0000
## Median : 7.000 Median :1.0000 Median :1.0 Median :0.0000
## Mean : 8.298 Mean :0.8299 Mean :0.6 Mean :0.7252
## 3rd Qu.:11.000 3rd Qu.:1.0000 3rd Qu.:1.0 3rd Qu.:0.0000
## Max. :11.000 Max. :1.0000 Max. :1.0 Max. :9.0000
## NA's :107 NA's :5 NA's :411231
## ps_car_05_cat ps_car_06_cat ps_car_07_cat ps_car_08_cat
## Min. :0.00 Min. : 0.000 Min. :0.000 Min. :0.0000
## 1st Qu.:0.00 1st Qu.: 1.000 1st Qu.:1.000 1st Qu.:1.0000
## Median :1.00 Median : 7.000 Median :1.000 Median :1.0000
## Mean :0.53 Mean : 6.555 Mean :0.948 Mean :0.8321
## 3rd Qu.:1.00 3rd Qu.:11.000 3rd Qu.:1.000 3rd Qu.:1.0000
## Max. :1.00 Max. :17.000 Max. :1.000 Max. :1.0000
## NA's :266551 NA's :11489
## ps_car_09_cat ps_car_10_cat ps_car_11_cat ps_car_11
## Min. :0.000 Min. :0.0000 Min. : 1.00 Min. :0.000
## 1st Qu.:0.000 1st Qu.:1.0000 1st Qu.: 32.00 1st Qu.:2.000
## Median :2.000 Median :1.0000 Median : 65.00 Median :3.000
## Mean :1.331 Mean :0.9921 Mean : 62.22 Mean :2.346
## 3rd Qu.:2.000 3rd Qu.:1.0000 3rd Qu.: 93.00 3rd Qu.:3.000
## Max. :4.000 Max. :2.0000 Max. :104.00 Max. :3.000
## NA's :569 NA's :5
## ps_car_12 ps_car_13 ps_car_14 ps_car_15
## Min. :0.1000 Min. :0.2506 Min. :0.11 Min. :0.000
## 1st Qu.:0.3162 1st Qu.:0.6709 1st Qu.:0.35 1st Qu.:2.828
## Median :0.3742 Median :0.7658 Median :0.37 Median :3.317
## Mean :0.3799 Mean :0.8133 Mean :0.37 Mean :3.066
## 3rd Qu.:0.4000 3rd Qu.:0.9062 3rd Qu.:0.40 3rd Qu.:3.606
## Max. :1.2649 Max. :3.7206 Max. :0.64 Max. :3.742
## NA's :1 NA's :42620
## ps_calc_01 ps_calc_02 ps_calc_03 ps_calc_04
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.000
## 1st Qu.:0.2000 1st Qu.:0.2000 1st Qu.:0.2000 1st Qu.:2.000
## Median :0.5000 Median :0.4000 Median :0.5000 Median :2.000
## Mean :0.4498 Mean :0.4496 Mean :0.4498 Mean :2.372
## 3rd Qu.:0.7000 3rd Qu.:0.7000 3rd Qu.:0.7000 3rd Qu.:3.000
## Max. :0.9000 Max. :0.9000 Max. :0.9000 Max. :5.000
##
## ps_calc_05 ps_calc_06 ps_calc_07 ps_calc_08
## Min. :0.000 Min. : 0.000 Min. :0.000 Min. : 2.000
## 1st Qu.:1.000 1st Qu.: 7.000 1st Qu.:2.000 1st Qu.: 8.000
## Median :2.000 Median : 8.000 Median :3.000 Median : 9.000
## Mean :1.886 Mean : 7.689 Mean :3.006 Mean : 9.226
## 3rd Qu.:3.000 3rd Qu.: 9.000 3rd Qu.:4.000 3rd Qu.:10.000
## Max. :6.000 Max. :10.000 Max. :9.000 Max. :12.000
##
## ps_calc_09 ps_calc_10 ps_calc_11 ps_calc_12
## Min. :0.000 Min. : 0.000 Min. : 0.000 Min. : 0.000
## 1st Qu.:1.000 1st Qu.: 6.000 1st Qu.: 4.000 1st Qu.: 1.000
## Median :2.000 Median : 8.000 Median : 5.000 Median : 1.000
## Mean :2.339 Mean : 8.434 Mean : 5.441 Mean : 1.442
## 3rd Qu.:3.000 3rd Qu.:10.000 3rd Qu.: 7.000 3rd Qu.: 2.000
## Max. :7.000 Max. :25.000 Max. :19.000 Max. :10.000
##
## ps_calc_13 ps_calc_14 ps_calc_15_bin ps_calc_16_bin
## Min. : 0.000 Min. : 0.000 Min. :0.0000 Min. :0.0000
## 1st Qu.: 2.000 1st Qu.: 6.000 1st Qu.:0.0000 1st Qu.:0.0000
## Median : 3.000 Median : 7.000 Median :0.0000 Median :1.0000
## Mean : 2.872 Mean : 7.539 Mean :0.1224 Mean :0.6278
## 3rd Qu.: 4.000 3rd Qu.: 9.000 3rd Qu.:0.0000 3rd Qu.:1.0000
## Max. :13.000 Max. :23.000 Max. :1.0000 Max. :1.0000
##
## ps_calc_17_bin ps_calc_18_bin ps_calc_19_bin ps_calc_20_bin
## Min. :0.0000 Min. :0.0000 Min. :0.000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.000 1st Qu.:0.0000
## Median :1.0000 Median :0.0000 Median :0.000 Median :0.0000
## Mean :0.5542 Mean :0.2872 Mean :0.349 Mean :0.1533
## 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.000 3rd Qu.:0.0000
## Max. :1.0000 Max. :1.0000 Max. :1.000 Max. :1.0000
##
# Total number of missing cells across all columns of the test set
test %>%
  is.na() %>%
  sum()
## [1] 1270295
# Total number of missing cells across all columns of the training set
train %>%
  is.na() %>%
  sum()
## [1] 846458
# Plot the missing-value pattern, restricted to the training columns that
# contain at least one NA.
aggr(
  train[, colSums(is.na(train)) > 0],
  prop = FALSE,
  combined = TRUE,
  numbers = TRUE,
  bars = FALSE,
  cex.axis = 0.7
)
## Warning in plot.aggr(res, ...): not enough vertical space to display
## frequencies (too many combinations)
**ps_car_03_cat and ps_car_05_cat have the largest number of NAs.**
# Correlation matrix of the training columns, drawn as a lower-triangle
# correlogram.
# The "cat", "bin" and target columns are coerced to integer first so that
# cor() receives only numeric input.
# as.integer is passed directly to mutate_at(): the funs() wrapper is
# deprecated in dplyr.
train %>%
  mutate_at(vars(ends_with("cat"), ends_with("bin")), as.integer) %>%
  mutate(target = as.integer(target)) %>%
  # "complete.obs" drops every row that has an NA in any column
  cor(use = "complete.obs") %>%
  corrplot(type = "lower", tl.col = "black", diag = FALSE)
This correlation plot shows the pairwise correlations between the variables, indicating whether two variables tend to move together as one changes. It can be used as a starting point for analysis.