Part 1 - Introduction: Can a machine learning model be a better predictor of annual auto insurance claims than basic statistics?

Machine learning has been a hot topic over the past couple of years, and I am interested in learning more about it and how it compares to traditional regression models.

Part 2 - Data:

The data were collected from Kaggle's Safe Driver Prediction competition, sponsored by Porto Seguro, a Brazilian auto insurance company: https://www.kaggle.com/c/porto-seguro-safe-driver-prediction

The cases are the individual drivers (policyholders) in this dataset.

Part 3 - Exploratory data analysis:

# Load the libraries used for data manipulation, modeling, and ROC analysis
# (tidyverse already attaches dplyr)
library(tidyverse)
library(caret)
library(verification)
library(repr)
library(ROCR)

# Read the training data and take a quick look at its structure
dtrain <- read_csv('train.csv', col_types = cols())
glimpse(dtrain)
## Observations: 595,212
## Variables: 59
## $ id             <int> 7, 9, 13, 16, 17, 19, 20, 22, 26, 28, 34, 35, 3...
## $ target         <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,...
## $ ps_ind_01      <int> 2, 1, 5, 0, 0, 5, 2, 5, 5, 1, 5, 2, 2, 1, 5, 5,...
## $ ps_ind_02_cat  <int> 2, 1, 4, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1,...
## $ ps_ind_03      <int> 5, 7, 9, 2, 0, 4, 3, 4, 3, 2, 2, 3, 1, 3, 11, 3...
## $ ps_ind_04_cat  <int> 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1,...
## $ ps_ind_05_cat  <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ ps_ind_06_bin  <int> 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ ps_ind_07_bin  <int> 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1,...
## $ ps_ind_08_bin  <int> 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0,...
## $ ps_ind_09_bin  <int> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,...
## $ ps_ind_10_bin  <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ ps_ind_11_bin  <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ ps_ind_12_bin  <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ ps_ind_13_bin  <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ ps_ind_14      <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ ps_ind_15      <int> 11, 3, 12, 8, 9, 6, 8, 13, 6, 4, 3, 9, 10, 12, ...
## $ ps_ind_16_bin  <int> 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0,...
## $ ps_ind_17_bin  <int> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ ps_ind_18_bin  <int> 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1,...
## $ ps_reg_01      <dbl> 0.7, 0.8, 0.0, 0.9, 0.7, 0.9, 0.6, 0.7, 0.9, 0....
## $ ps_reg_02      <dbl> 0.2, 0.4, 0.0, 0.2, 0.6, 1.8, 0.1, 0.4, 0.7, 1....
## $ ps_reg_03      <dbl> 0.7180703, 0.7660777, -1.0000000, 0.5809475, 0....
## $ ps_car_01_cat  <int> 10, 11, 7, 7, 11, 10, 6, 11, 10, 11, 11, 11, 6,...
## $ ps_car_02_cat  <int> 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1,...
## $ ps_car_03_cat  <int> -1, -1, -1, 0, -1, -1, -1, 0, -1, 0, -1, -1, -1...
## $ ps_car_04_cat  <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 8, 0, 0, 0, 0, 9,...
## $ ps_car_05_cat  <int> 1, -1, -1, 1, -1, 0, 1, 0, 1, 0, -1, -1, -1, 1,...
## $ ps_car_06_cat  <int> 4, 11, 14, 11, 14, 14, 11, 11, 14, 14, 13, 11, ...
## $ ps_car_07_cat  <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ ps_car_08_cat  <int> 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0,...
## $ ps_car_09_cat  <int> 0, 2, 2, 3, 2, 0, 0, 2, 0, 2, 2, 0, 2, 2, 2, 0,...
## $ ps_car_10_cat  <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ ps_car_11_cat  <int> 12, 19, 60, 104, 82, 104, 99, 30, 68, 104, 20, ...
## $ ps_car_11      <int> 2, 3, 1, 1, 3, 2, 2, 3, 3, 2, 3, 3, 3, 3, 1, 2,...
## $ ps_car_12      <dbl> 0.4000000, 0.3162278, 0.3162278, 0.3741657, 0.3...
## $ ps_car_13      <dbl> 0.8836789, 0.6188165, 0.6415857, 0.5429488, 0.5...
## $ ps_car_14      <dbl> 0.3708099, 0.3887158, 0.3472751, 0.2949576, 0.3...
## $ ps_car_15      <dbl> 3.605551, 2.449490, 3.316625, 2.000000, 2.00000...
## $ ps_calc_01     <dbl> 0.6, 0.3, 0.5, 0.6, 0.4, 0.7, 0.2, 0.1, 0.9, 0....
## $ ps_calc_02     <dbl> 0.5, 0.1, 0.7, 0.9, 0.6, 0.8, 0.6, 0.5, 0.8, 0....
## $ ps_calc_03     <dbl> 0.2, 0.3, 0.1, 0.1, 0.0, 0.4, 0.5, 0.1, 0.6, 0....
## $ ps_calc_04     <int> 3, 2, 2, 2, 2, 3, 2, 1, 3, 2, 2, 2, 4, 2, 3, 2,...
## $ ps_calc_05     <int> 1, 1, 2, 4, 2, 1, 2, 2, 1, 2, 3, 2, 1, 1, 1, 1,...
## $ ps_calc_06     <int> 10, 9, 9, 7, 6, 8, 8, 7, 7, 8, 8, 8, 8, 10, 8, ...
## $ ps_calc_07     <int> 1, 5, 1, 1, 3, 2, 1, 1, 3, 2, 2, 2, 4, 1, 2, 5,...
## $ ps_calc_08     <int> 10, 8, 8, 8, 10, 11, 8, 6, 9, 9, 9, 10, 11, 8, ...
## $ ps_calc_09     <int> 1, 1, 2, 4, 2, 3, 3, 1, 4, 1, 4, 1, 1, 3, 3, 2,...
## $ ps_calc_10     <int> 5, 7, 7, 2, 12, 8, 10, 13, 11, 11, 7, 8, 9, 8, ...
## $ ps_calc_11     <int> 9, 3, 4, 2, 3, 4, 3, 7, 4, 3, 6, 9, 6, 2, 4, 5,...
## $ ps_calc_12     <int> 1, 1, 2, 2, 1, 2, 0, 1, 2, 5, 3, 2, 3, 0, 1, 2,...
## $ ps_calc_13     <int> 5, 1, 7, 4, 1, 0, 0, 3, 1, 0, 3, 1, 3, 4, 3, 6,...
## $ ps_calc_14     <int> 8, 9, 7, 9, 3, 9, 10, 6, 5, 6, 6, 10, 8, 3, 9, ...
## $ ps_calc_15_bin <int> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ ps_calc_16_bin <int> 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1,...
## $ ps_calc_17_bin <int> 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1,...
## $ ps_calc_18_bin <int> 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,...
## $ ps_calc_19_bin <int> 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1,...
## $ ps_calc_20_bin <int> 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0,...

There are just over 595,000 observations and 59 columns. The target column is the dependent variable: 1 means the customer filed a claim, 0 means they did not.

We will first take a look at how many claims were filed in this dataset.

# Plot and count the two target classes
ggplot(dtrain, aes(x = target)) + geom_bar()

dtrain %>%
  group_by(target) %>%
  summarise(n())
## # A tibble: 2 x 2
##   target  `n()`
##    <int>  <int>
## 1      0 573518
## 2      1  21694
## [1] "We can see that 3.78 % of the customers in this dataset filed a claim"

The data are anonymized, but we do know that variables ending in _cat are categorical and those ending in _bin are binary; everything else is treated as continuous. Additionally, missing values in this set are coded as -1 instead of NA, which we need to fix so that unavailable data isn't misconstrued as an additional factor level.

# Recode -1 as NA, then count the missing values
dtrain[dtrain == -1] <- NA
sum(is.na(dtrain))
## [1] 846458
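A quick tally shows that the missing values are concentrated in just a few variables (the counts below sum, with the remaining minor columns, to the 846,458 NAs found above):

# Count NAs per column and show the most affected variables
sort(colSums(is.na(dtrain)), decreasing = TRUE)[1:5]
## ps_car_03_cat ps_car_05_cat     ps_reg_03     ps_car_14 ps_car_07_cat 
##        411231        266551        107772         42620         11489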

As we saw in the glimpse of the dataset, the "_cat" variables are stored as integers and need to be converted to factors.

# Identify the categorical columns by their _cat suffix and convert them to factors
cat_variables <- names(dtrain)[grep('_cat$', names(dtrain))]

dtrain <- dtrain %>%
  mutate_at(.vars = cat_variables, .funs = as.factor)

Next we perform dummy variable encoding.

# One-hot encode the factors (the -1 drops the intercept column)
dtrain <- model.matrix(~ . - 1, data = dtrain)
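To see what this does, here is a tiny hypothetical example (toy data, not from the dataset): each factor level becomes its own 0/1 indicator column.

# Toy illustration of dummy encoding with model.matrix
toy <- data.frame(color = factor(c("red", "blue", "red")))
model.matrix(~ . - 1, data = toy)
##   colorblue colorred
## 1         0        1
## 2         1        0
## 3         0        1
## (contrast attributes omitted)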

From here we split the data into training and test sets, with 80% of the observations used for training and 20% for testing, by creating a random logical index. Note that model.matrix in the previous step silently drops rows containing NA values (R's default na.omit action), which is why the model below is fit on far fewer observations than the original 595,212.

set.seed(1)

# Randomly flag ~80% of rows for training and ~20% for testing
training_slice <- sample(c(TRUE, FALSE), replace = TRUE, size = nrow(dtrain), prob = c(0.8, 0.2))

training <- as.data.frame(dtrain[training_slice, ])
testing <- as.data.frame(dtrain[!training_slice, ])
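As an optional sanity check, we can confirm that the claim rate is similar in the two subsets:

# The claim rate should be roughly equal in the training and testing sets
mean(training$target)
mean(testing$target)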

Here we use glm to fit a generalized linear model, specifically a logistic regression (binomial family with a logit link), and obtain a summary.

# Logistic regression on all predictors except the id column
model <- glm(target ~ . - id, data = training, family = binomial(link = 'logit'))

summary(model)
## 
## Call:
## glm(formula = target ~ . - id, family = binomial(link = "logit"), 
##     data = training)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.8129  -0.3320  -0.2806  -0.2371   2.9660  
## 
## Coefficients: (6 not defined because of singularities)
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      -4.3898534  0.7845220  -5.596 2.20e-08 ***
## ps_ind_01         0.0113356  0.0084175   1.347 0.178088    
## ps_ind_02_cat1   -0.1174078  0.0977659  -1.201 0.229787    
## ps_ind_02_cat2   -0.0544071  0.1061354  -0.513 0.608217    
## ps_ind_02_cat3   -0.2841776  0.1167366  -2.434 0.014919 *  
## ps_ind_02_cat4           NA         NA      NA       NA    
## ps_ind_03         0.0313411  0.0067343   4.654 3.26e-06 ***
## ps_ind_04_cat1    0.0640757  0.0361152   1.774 0.076030 .  
## ps_ind_05_cat1    0.2930870  0.0949132   3.088 0.002015 ** 
## ps_ind_05_cat2    0.6049128  0.1387648   4.359 1.30e-05 ***
## ps_ind_05_cat3    0.2991693  0.0967573   3.092 0.001988 ** 
## ps_ind_05_cat4    0.5506171  0.0724826   7.597 3.04e-14 ***
## ps_ind_05_cat5    0.1127887  0.2215282   0.509 0.610655    
## ps_ind_05_cat6    0.5571524  0.0762915   7.303 2.82e-13 ***
## ps_ind_06_bin     0.0455364  0.0568203   0.801 0.422893    
## ps_ind_07_bin     0.2817798  0.0551944   5.105 3.30e-07 ***
## ps_ind_08_bin     0.2770787  0.0568122   4.877 1.08e-06 ***
## ps_ind_09_bin            NA         NA      NA       NA    
## ps_ind_10_bin    -0.1774701  0.4258633  -0.417 0.676876    
## ps_ind_11_bin    -0.0526750  0.2096442  -0.251 0.801614    
## ps_ind_12_bin     0.0220820  0.0904480   0.244 0.807122    
## ps_ind_13_bin     0.1549626  0.2526340   0.613 0.539620    
## ps_ind_14                NA         NA      NA       NA    
## ps_ind_15        -0.0239200  0.0050765  -4.712 2.45e-06 ***
## ps_ind_16_bin    -0.1016086  0.0693221  -1.466 0.142717    
## ps_ind_17_bin     0.2903996  0.0726213   3.999 6.37e-05 ***
## ps_ind_18_bin    -0.1626351  0.0834294  -1.949 0.051251 .  
## ps_reg_01         0.0026723  0.0780318   0.034 0.972680    
## ps_reg_02         0.1253423  0.0539190   2.325 0.020091 *  
## ps_reg_03         0.0049855  0.0636912   0.078 0.937608    
## ps_car_01_cat1   -0.0346065  0.5507918  -0.063 0.949902    
## ps_car_01_cat2    0.0251952  0.8937756   0.028 0.977511    
## ps_car_01_cat3   -0.0884976  0.5382942  -0.164 0.869413    
## ps_car_01_cat4    0.1422559  0.5530641   0.257 0.797014    
## ps_car_01_cat5   -0.3141870  0.5240242  -0.600 0.548796    
## ps_car_01_cat6    0.1046227  0.5565131   0.188 0.850879    
## ps_car_01_cat7   -0.3878858  0.5208658  -0.745 0.456457    
## ps_car_01_cat8    0.1109351  0.5393373   0.206 0.837035    
## ps_car_01_cat9   -0.0911257  0.5216127  -0.175 0.861315    
## ps_car_01_cat10  -0.2267563  0.5449189  -0.416 0.677316    
## ps_car_01_cat11  -0.1852745  0.5200271  -0.356 0.721632    
## ps_car_02_cat1    0.0230586  0.0551110   0.418 0.675653    
## ps_car_03_cat1    0.1105964  0.0385513   2.869 0.004120 ** 
## ps_car_04_cat1   -0.3056217  0.1228819  -2.487 0.012878 *  
## ps_car_04_cat2    0.4420886  0.1804603   2.450 0.014294 *  
## ps_car_04_cat3   -0.1874058  0.2767781  -0.677 0.498344    
## ps_car_04_cat4   -0.6449051  1.0257523  -0.629 0.529536    
## ps_car_04_cat5   -0.3875273  0.3986238  -0.972 0.330969    
## ps_car_04_cat6   -0.2045278  0.2751129  -0.743 0.457220    
## ps_car_04_cat7    0.2315697  0.7619380   0.304 0.761187    
## ps_car_04_cat8   -0.1733983  0.1983141  -0.874 0.381921    
## ps_car_04_cat9   -0.5824057  0.1630168  -3.573 0.000353 ***
## ps_car_05_cat1   -0.0385359  0.0329291  -1.170 0.241894    
## ps_car_06_cat1    0.0979376  0.1914027   0.512 0.608873    
## ps_car_06_cat2    0.0543488  0.3183853   0.171 0.864459    
## ps_car_06_cat3    0.1425461  0.2055334   0.694 0.487969    
## ps_car_06_cat4   -0.0512703  0.2037808  -0.252 0.801354    
## ps_car_06_cat5    0.4972787  0.2608622   1.906 0.056613 .  
## ps_car_06_cat6   -0.0464541  0.2300336  -0.202 0.839960    
## ps_car_06_cat7    0.1702680  0.1900271   0.896 0.370242    
## ps_car_06_cat8    0.1509570  0.3473356   0.435 0.663842    
## ps_car_06_cat9    0.3565190  0.1676467   2.127 0.033453 *  
## ps_car_06_cat10   0.5813739  0.2836338   2.050 0.040390 *  
## ps_car_06_cat11   0.2274754  0.1792894   1.269 0.204526    
## ps_car_06_cat12   0.7108432  0.2291448   3.102 0.001921 ** 
## ps_car_06_cat13   0.1293143  0.2271814   0.569 0.569213    
## ps_car_06_cat14   0.0970884  0.1768086   0.549 0.582926    
## ps_car_06_cat15   0.4895243  0.2203963   2.221 0.026344 *  
## ps_car_06_cat16   0.5523301  0.1884853   2.930 0.003386 ** 
## ps_car_06_cat17   0.4373896  0.1787195   2.447 0.014391 *  
## ps_car_07_cat1   -0.2716495  0.0634814  -4.279 1.88e-05 ***
## ps_car_08_cat1    0.0533716  0.0478072   1.116 0.264254    
## ps_car_09_cat1    0.3797259  0.0758362   5.007 5.52e-07 ***
## ps_car_09_cat2    0.1377510  0.0592459   2.325 0.020068 *  
## ps_car_09_cat3    0.2501435  0.0960515   2.604 0.009207 ** 
## ps_car_09_cat4    0.3973249  0.2947207   1.348 0.177613    
## ps_car_10_cat1   -0.2393104  0.1587432  -1.508 0.131675    
## ps_car_10_cat2   -0.4094035  0.5460125  -0.750 0.453371    
## ps_car_11_cat2           NA         NA      NA       NA    
## ps_car_11_cat3    0.2624608  0.3614606   0.726 0.467770    
## ps_car_11_cat4    0.1837416  0.4191621   0.438 0.661129    
## ps_car_11_cat5   -0.2191734  0.3681665  -0.595 0.551636    
## ps_car_11_cat6    0.1217609  0.4326330   0.281 0.778372    
## ps_car_11_cat7   -0.3737126  0.3853882  -0.970 0.332194    
## ps_car_11_cat8   -0.1257166  0.5767899  -0.218 0.827461    
## ps_car_11_cat9    0.4722852  0.4329196   1.091 0.275303    
## ps_car_11_cat10   0.3824015  0.3287681   1.163 0.244775    
## ps_car_11_cat11   0.0926293  0.3606638   0.257 0.797310    
## ps_car_11_cat12   0.3305078  0.3717581   0.889 0.373982    
## ps_car_11_cat13   0.0551506  0.3991552   0.138 0.890107    
## ps_car_11_cat14  -0.2640741  0.3863642  -0.683 0.494301    
## ps_car_11_cat15  -0.4361432  0.4592278  -0.950 0.342249    
## ps_car_11_cat16  -0.1720470  0.3884628  -0.443 0.657844    
## ps_car_11_cat17   0.3179591  0.3683271   0.863 0.387999    
## ps_car_11_cat18   0.5586660  0.5171986   1.080 0.280063    
## ps_car_11_cat19  -0.0148769  0.4668969  -0.032 0.974581    
## ps_car_11_cat20   0.3078829  0.3973474   0.775 0.438431    
## ps_car_11_cat21   0.5163162  0.3774567   1.368 0.171349    
## ps_car_11_cat22   0.2467961  0.3922995   0.629 0.529283    
## ps_car_11_cat23   0.1731514  0.4390037   0.394 0.693272    
## ps_car_11_cat24   0.6381500  0.4064964   1.570 0.116443    
## ps_car_11_cat25          NA         NA      NA       NA    
## ps_car_11_cat26   0.6005595  0.4524755   1.327 0.184418    
## ps_car_11_cat27   0.0976000  0.4152154   0.235 0.814163    
## ps_car_11_cat28  -0.1404650  0.4255275  -0.330 0.741327    
## ps_car_11_cat29   0.2180112  0.3945215   0.553 0.580540    
## ps_car_11_cat30  -0.2074416  0.4321914  -0.480 0.631244    
## ps_car_11_cat31   0.3628709  0.3461550   1.048 0.294505    
## ps_car_11_cat32   0.2105151  0.4054109   0.519 0.603577    
## ps_car_11_cat33   0.4084406  0.4177141   0.978 0.328174    
## ps_car_11_cat34   0.4861664  0.4261988   1.141 0.253993    
## ps_car_11_cat35   0.2545913  0.4023102   0.633 0.526849    
## ps_car_11_cat36   0.0217332  0.3730744   0.058 0.953546    
## ps_car_11_cat37   0.4509687  0.4131443   1.092 0.275030    
## ps_car_11_cat38   0.2944047  0.3290714   0.895 0.370973    
## ps_car_11_cat39   0.1019806  0.8073178   0.126 0.899478    
## ps_car_11_cat40   0.2949767  0.3462650   0.852 0.394280    
## ps_car_11_cat41   0.4999593  0.4041197   1.237 0.216029    
## ps_car_11_cat42   0.1754668  0.3913187   0.448 0.653866    
## ps_car_11_cat43   0.3567644  0.4894941   0.729 0.466098    
## ps_car_11_cat44  -0.0985402  0.3882864  -0.254 0.799664    
## ps_car_11_cat45   0.4415902  0.3990850   1.107 0.268507    
## ps_car_11_cat46   0.1285349  0.4194152   0.306 0.759253    
## ps_car_11_cat47   0.3699766  0.3640711   1.016 0.309524    
## ps_car_11_cat48   0.7824686  0.3828173   2.044 0.040956 *  
## ps_car_11_cat49  -0.3163666  0.5765421  -0.549 0.583190    
## ps_car_11_cat50   0.6437453  0.4154474   1.550 0.121256    
## ps_car_11_cat51   0.3466914  0.3151767   1.100 0.271336    
## ps_car_11_cat52   0.0055149  0.3644157   0.015 0.987926    
## ps_car_11_cat53   0.0224636  0.3919363   0.057 0.954295    
## ps_car_11_cat54   0.3343409  0.4357886   0.767 0.442957    
## ps_car_11_cat55  -0.2433775  0.4279113  -0.569 0.569521    
## ps_car_11_cat56   0.3437517  0.4063898   0.846 0.397627    
## ps_car_11_cat57  -0.0360624  0.4166325  -0.087 0.931024    
## ps_car_11_cat58  -0.1746096  0.5087131  -0.343 0.731420    
## ps_car_11_cat59   0.3455564  0.4100507   0.843 0.399387    
## ps_car_11_cat60   0.3606602  0.3798503   0.949 0.342377    
## ps_car_11_cat61   0.2144730  0.3748132   0.572 0.567178    
## ps_car_11_cat62   0.4579913  0.4369813   1.048 0.294602    
## ps_car_11_cat63  -0.0184914  0.4315940  -0.043 0.965826    
## ps_car_11_cat64   0.0752009  0.3184104   0.236 0.813296    
## ps_car_11_cat65   0.4184929  0.3336582   1.254 0.209749    
## ps_car_11_cat66  -0.0065788  0.8099641  -0.008 0.993519    
## ps_car_11_cat67   0.1071411  0.3679326   0.291 0.770900    
## ps_car_11_cat68   0.1401974  0.3761000   0.373 0.709323    
## ps_car_11_cat69   0.0862253  0.3850537   0.224 0.822811    
## ps_car_11_cat70   0.1004369  0.3300266   0.304 0.760877    
## ps_car_11_cat71  -0.3114486  0.4352920  -0.715 0.474304    
## ps_car_11_cat72   0.2160514  0.3795251   0.569 0.569174    
## ps_car_11_cat73  -0.0720058  0.5336919  -0.135 0.892675    
## ps_car_11_cat74   0.4432853  0.3644502   1.216 0.223866    
## ps_car_11_cat75   0.5448350  0.3916660   1.391 0.164204    
## ps_car_11_cat76   0.2484173  0.3788230   0.656 0.511978    
## ps_car_11_cat77  -0.0114720  0.4640662  -0.025 0.980278    
## ps_car_11_cat78   0.2295606  0.3870219   0.593 0.553083    
## ps_car_11_cat79   0.1813696  0.3847235   0.471 0.637335    
## ps_car_11_cat80          NA         NA      NA       NA    
## ps_car_11_cat81   0.2042399  0.4453613   0.459 0.646526    
## ps_car_11_cat82   0.3587676  0.3836170   0.935 0.349673    
## ps_car_11_cat83  -0.1316017  0.4313369  -0.305 0.760289    
## ps_car_11_cat84   0.4557751  0.3874040   1.176 0.239401    
## ps_car_11_cat85   0.0468097  0.3274506   0.143 0.886328    
## ps_car_11_cat86   0.2342612  0.4387278   0.534 0.593372    
## ps_car_11_cat87   0.3428163  0.3859184   0.888 0.374372    
## ps_car_11_cat88   0.3194885  0.4162531   0.768 0.442764    
## ps_car_11_cat89  -0.2942349  0.4351912  -0.676 0.498974    
## ps_car_11_cat90   0.2297738  0.2941888   0.781 0.434778    
## ps_car_11_cat91   0.1241788  0.4037033   0.308 0.758387    
## ps_car_11_cat92   0.1485267  0.3757909   0.395 0.692668    
## ps_car_11_cat93   0.2289754  0.3678366   0.622 0.533618    
## ps_car_11_cat94  -0.0395119  0.3477924  -0.114 0.909549    
## ps_car_11_cat95  -0.1542952  0.4371093  -0.353 0.724096    
## ps_car_11_cat96   0.0285993  0.4271062   0.067 0.946613    
## ps_car_11_cat97   0.4176185  0.4005480   1.043 0.297125    
## ps_car_11_cat98   0.4220975  0.4341577   0.972 0.330940    
## ps_car_11_cat99   0.2024403  0.3617915   0.560 0.575787    
## ps_car_11_cat100  0.1531399  0.3566729   0.429 0.667664    
## ps_car_11_cat101  0.0265199  0.3593640   0.074 0.941172    
## ps_car_11_cat102  0.1619954  0.4047452   0.400 0.688979    
## ps_car_11_cat103  0.0264123  0.3481422   0.076 0.939525    
## ps_car_11_cat104  0.1783334  0.3048897   0.585 0.558607    
## ps_car_11         0.0340956  0.0400221   0.852 0.394260    
## ps_car_12         1.4803310  0.5152338   2.873 0.004064 ** 
## ps_car_13         0.6180815  0.2308837   2.677 0.007428 ** 
## ps_car_14        -0.2501866  0.9705557  -0.258 0.796579    
## ps_car_15         0.0259825  0.0480698   0.541 0.588842    
## ps_calc_01        0.0565804  0.0532935   1.062 0.288382    
## ps_calc_02        0.0492631  0.0534354   0.922 0.356571    
## ps_calc_03        0.0204197  0.0531730   0.384 0.700961    
## ps_calc_04        0.0105209  0.0136971   0.768 0.442422    
## ps_calc_05        0.0041077  0.0134937   0.304 0.760812    
## ps_calc_06        0.0067236  0.0114723   0.586 0.557825    
## ps_calc_07       -0.0032312  0.0108279  -0.298 0.765387    
## ps_calc_08       -0.0005066  0.0104784  -0.048 0.961439    
## ps_calc_09       -0.0025478  0.0122714  -0.208 0.835527    
## ps_calc_10        0.0090975  0.0052597   1.730 0.083688 .  
## ps_calc_11       -0.0100160  0.0065921  -1.519 0.128661    
## ps_calc_12       -0.0123006  0.0127813  -0.962 0.335851    
## ps_calc_13       -0.0104861  0.0090517  -1.158 0.246675    
## ps_calc_14        0.0011883  0.0055795   0.213 0.831351    
## ps_calc_15_bin   -0.0330426  0.0472098  -0.700 0.483984    
## ps_calc_16_bin    0.0423990  0.0318005   1.333 0.182440    
## ps_calc_17_bin   -0.0402382  0.0307318  -1.309 0.190420    
## ps_calc_18_bin    0.0025822  0.0338093   0.076 0.939120    
## ps_calc_19_bin   -0.0713602  0.0323710  -2.204 0.027493 *  
## ps_calc_20_bin   -0.0162370  0.0427031  -0.380 0.703775    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 36954  on 99998  degrees of freedom
## Residual deviance: 35896  on 99799  degrees of freedom
## AIC: 36296
## 
## Number of Fisher Scoring iterations: 6
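Because we used a logit link, the coefficients are on the log-odds scale; exponentiating gives odds ratios, which are easier to interpret. For example, the estimate for ps_car_13 (0.618) corresponds to an odds ratio of roughly exp(0.618) ~ 1.86, i.e., a one-unit increase in ps_car_13 multiplies the odds of a claim by about 1.86:

# Convert a log-odds coefficient to an odds ratio for interpretation
exp(coef(model)["ps_car_13"])  # roughly 1.86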

After building and summarizing the model, we make predictions on the testing set.

preds <- predict(model, newdata = testing, type = "response")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading

Once we have the predictions, we evaluate performance by calculating the Area Under the ROC Curve (AUC).

# Build the ROC curve from the predicted probabilities computed above
pr <- prediction(preds, testing$target)
prf <- performance(pr, measure = "tpr", x.measure = "fpr")
plot(prf)

auc <- performance(pr, measure = "auc")
auc <- auc@y.values[[1]]

auc
## [1] 0.6156464

The logistic regression model achieves an AUC of approximately 0.616, or 61.6%, when predicting driver claims.

Part 4 - Machine learning model:

Now we can turn to a machine learning algorithm to see if it can do a better job of predicting whether a driver will file a claim.

In this case we use machine learning code from a Kaggle user that applies the xgboost package with caret tuning and Gini scoring: https://www.kaggle.com/captcalculator/r-xgboost-with-caret-tuning-and-gini-score/code

XGBoost, short for extreme gradient boosting, is a machine learning library for building predictive tree models that is popular within the Kaggle community. Boosting is a technique whereby new models are added to correct the errors made by existing models, until no further improvement can be made.
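As a minimal illustration (my own sketch, not the kernel's code), xgboost can be fit directly on the encoded training set we built earlier:

# Sketch: fit a basic gradient-boosted tree model on our encoded matrix
library(xgboost)

X <- as.matrix(training[, setdiff(names(training), c("id", "target"))])

xgb_fit <- xgboost(data = X, label = training$target,
                   objective = "binary:logistic",  # model claim probability
                   eta = 0.05, max_depth = 4,
                   subsample = 0.5, colsample_bytree = 0.75,
                   nrounds = 50, verbose = 0)

# Predicted claim probabilities on the held-out set
xgb_preds <- predict(xgb_fit, as.matrix(testing[, setdiff(names(testing), c("id", "target"))]))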

The objective of the XGBoost model is to use a gradient descent algorithm (http://www.onmyphd.com/?p=gradient.descent) to minimize the loss each time a new model is added.
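To make the idea concrete, here is a toy gradient descent in R minimizing f(x) = (x - 3)^2; the step size plays the same role as xgboost's eta parameter:

# Toy gradient descent: repeatedly step against the gradient of f(x) = (x - 3)^2
grad <- function(x) 2 * (x - 3)  # derivative of f
x <- 0                           # starting point
eta <- 0.1                       # learning rate (analogous to xgboost's eta)
for (i in 1:100) x <- x - eta * grad(x)
x                                # converges to the minimizer, 3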

The code was run with the following results:

  • Fold1: eta=0.05, max_depth=4, gamma=0.01, colsample_bytree=0.75, min_child_weight=0, subsample=0.5, nrounds=350
  • Fold1: eta=0.05, max_depth=4, gamma=0.01, colsample_bytree=0.75, min_child_weight=0, subsample=0.5, nrounds=350
  • Fold1: eta=0.05, max_depth=6, gamma=0.01, colsample_bytree=0.75, min_child_weight=0, subsample=0.5, nrounds=350
  • Fold1: eta=0.05, max_depth=6, gamma=0.01, colsample_bytree=0.75, min_child_weight=0, subsample=0.5, nrounds=350
  • Fold1: eta=0.10, max_depth=4, gamma=0.01, colsample_bytree=0.75, min_child_weight=0, subsample=0.5, nrounds=350
  • Fold1: eta=0.10, max_depth=4, gamma=0.01, colsample_bytree=0.75, min_child_weight=0, subsample=0.5, nrounds=350
  • Fold1: eta=0.10, max_depth=6, gamma=0.01, colsample_bytree=0.75, min_child_weight=0, subsample=0.5, nrounds=350
  • Fold1: eta=0.10, max_depth=6, gamma=0.01, colsample_bytree=0.75, min_child_weight=0, subsample=0.5, nrounds=350
  • Fold2: eta=0.05, max_depth=4, gamma=0.01, colsample_bytree=0.75, min_child_weight=0, subsample=0.5, nrounds=350
  • Fold2: eta=0.05, max_depth=4, gamma=0.01, colsample_bytree=0.75, min_child_weight=0, subsample=0.5, nrounds=350
  • Fold2: eta=0.05, max_depth=6, gamma=0.01, colsample_bytree=0.75, min_child_weight=0, subsample=0.5, nrounds=350
  • Fold2: eta=0.05, max_depth=6, gamma=0.01, colsample_bytree=0.75, min_child_weight=0, subsample=0.5, nrounds=350
  • Fold2: eta=0.10, max_depth=4, gamma=0.01, colsample_bytree=0.75, min_child_weight=0, subsample=0.5, nrounds=350
  • Fold2: eta=0.10, max_depth=4, gamma=0.01, colsample_bytree=0.75, min_child_weight=0, subsample=0.5, nrounds=350
  • Fold2: eta=0.10, max_depth=6, gamma=0.01, colsample_bytree=0.75, min_child_weight=0, subsample=0.5, nrounds=350
  • Fold2: eta=0.10, max_depth=6, gamma=0.01, colsample_bytree=0.75, min_child_weight=0, subsample=0.5, nrounds=350

Aggregating results
Selecting tuning parameters

Fitting nrounds = 350, max_depth = 4, eta = 0.05, gamma = 0.01, colsample_bytree = 0.75, min_child_weight = 0, subsample = 0.5 on full training set

print(Sys.time() - start)
Time difference of 12.29886 mins
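For context, here is a sketch of the caret tuning setup that would produce a log like the one above. The variable names x_train and y_train (a factor with levels "No"/"Yes") and the custom giniSummary function follow the referenced kernel; this is not the kernel's verbatim code.

library(caret)

# Tuning grid matching the four combinations seen in the fold log above
tune_grid <- expand.grid(
  nrounds = 350,
  eta = c(0.05, 0.10),
  max_depth = c(4, 6),
  gamma = 0.01,
  colsample_bytree = 0.75,
  min_child_weight = 0,
  subsample = 0.5
)

# Two-fold cross-validation with class probabilities and a Gini-based metric
# (giniSummary is the kernel's custom summary function returning NormalizedGini)
ctrl <- trainControl(method = "cv", number = 2, verboseIter = TRUE,
                     classProbs = TRUE, summaryFunction = giniSummary)

xgbmod <- train(x = x_train, y = y_train, method = "xgbTree",
                trControl = ctrl, tuneGrid = tune_grid,
                metric = "NormalizedGini")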

# make predictions

preds <- predict(xgbmod, newdata = x_test, type = "prob")
preds_final <- predict(xgbmod, newdata = dtest, type = "prob")

# convert test target values back to numeric for gini and roc.plot functions

levels(y_test) <- c("0", "1")
y_test_raw <- as.numeric(levels(y_test))[y_test]

# Diagnostics

print(xgbmod$results)
   eta max_depth gamma colsample_bytree min_child_weight subsample nrounds NormalizedGini NormalizedGiniSD
1 0.05         4  0.01             0.75                0       0.5     350      0.2686265      0.007491148
3 0.10         4  0.01             0.75                0       0.5     350      0.2546387      0.002516451
2 0.05         6  0.01             0.75                0       0.5     350      0.2614964      0.004365784
4 0.10         6  0.01             0.75                0       0.5     350      0.2222005      0.001365411

print(xgbmod$resample)
  NormalizedGini Resample
1      0.2633295    Fold1
2      0.2739235    Fold2

# plot results (useful for larger tuning grids)

plot(xgbmod)

# score the predictions against test data

normalizedGini(y_test_raw, preds$Yes)
[1] 0.2779311
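For reference, the normalizedGini function above is defined in the referenced kernel; a common implementation of the normalized Gini metric (a sketch following the widely used Kaggle form, not the kernel's verbatim code) looks like this:

# Normalized Gini: area between the model's Lorenz curve and the diagonal,
# scaled by the same quantity for a perfect ordering of the actuals
sumGini <- function(actual, predicted) {
  df <- data.frame(actual, predicted)
  df <- df[order(df$predicted, decreasing = TRUE), ]  # sort by prediction
  df$random <- seq_len(nrow(df)) / nrow(df)           # diagonal (random) line
  df$lorentz <- cumsum(df$actual) / sum(df$actual)    # cumulative positives found
  sum(df$lorentz - df$random)
}

normalizedGini <- function(actual, predicted) {
  sumGini(actual, predicted) / sumGini(actual, actual)
}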

To obtain an apples-to-apples comparison between our logistic regression model and the Kaggle user's machine learning model, we can convert the Gini score to AUC using the relationship Gini = (2 * AUC) - 1:

# Convert the normalized Gini score to AUC
Gini <- 0.2779311
AUC <- (Gini + 1) / 2
AUC
## [1] 0.6389655

The AUC of the xgboost model is approximately 0.639, or 63.9%.

Part 5 - Conclusion:

The XGBoost machine learning model achieves an AUC about 2.3 percentage points higher than the simple logistic regression model (0.639 vs. 0.616). Given a larger dataset, the ML model might do even better.

Some of the error in this experiment could stem from the preprocessing of the data. I received warning messages about a potentially rank-deficient fit, and it was a large undertaking trying to understand an anonymized data source for this project. As a beginner, it is uncomfortable not being able to make real-world sense of a correlation or relationship between variables. Choosing a machine learning algorithm was also more complicated, and the options more varied, than I anticipated. Another minor constraint was that the ML model took over 12 minutes to run each time, which made knitting a challenge.

Although the gain is only about 2.3 percentage points of AUC, applied at scale a machine learning model could significantly help Porto Seguro price policies more accurately and run a more efficient business.

References:

https://www.kaggle.com/c/porto-seguro-safe-driver-prediction

https://www.r-bloggers.com/how-to-perform-a-logistic-regression-in-r/

https://topepo.github.io/caret/pre-processing.html#dummy

https://www.kaggle.com/captcalculator/r-xgboost-with-caret-tuning-and-gini-score/code