DATA 621 – Business Analytics and Data Mining Homework #4 Assignment Requirements

Overview In this homework assignment, you will explore, analyze and model a data set containing approximately 8000 records representing a customer at an auto insurance company. Each record has two response variables. The first response variable, TARGET_FLAG, is a 1 or a 0. A “1” means that the person was in a car crash. A zero means that the person was not in a car crash. The second response variable is TARGET_AMT. This value is zero if the person did not crash their car. But if they did crash their car, this number will be a value greater than zero.

Your objective is to build multiple linear regression and binary logistic regression models on the training data to predict the probability that a person will crash their car and also the amount of money it will cost if the person does crash their car. You can only use the variables given to you (or variables that you derive from the variables provided).

library(tidyverse)
library(ggplot2)
library(mice)
library(car)
library(Hmisc)
library(corrplot)
library(pscl)
library(boot)
library(MASS)
library(caret)

Load data

# Load insurance csv
df_ins_raw <- read.csv("insurance_training_data.csv")

# Removing index as instructed
df_ins_raw <- subset(df_ins_raw, select = -c(INDEX))

# Preview data
glimpse(df_ins_raw)
## Rows: 8,161
## Columns: 25
## $ TARGET_FLAG <int> 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1…
## $ TARGET_AMT  <dbl> 0.000, 0.000, 0.000, 0.000, 0.000, 2946.000, 0.000, 4021.0…
## $ KIDSDRIV    <int> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ AGE         <int> 60, 43, 35, 51, 50, 34, 54, 37, 34, 50, 53, 43, 55, 53, 45…
## $ HOMEKIDS    <int> 0, 0, 1, 0, 0, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 3, 0, 3, 2, 1…
## $ YOJ         <int> 11, 11, 10, 14, NA, 12, NA, NA, 10, 7, 14, 5, 11, 11, 0, 1…
## $ INCOME      <chr> "$67,349", "$91,449", "$16,039", "", "$114,986", "$125,301…
## $ PARENT1     <chr> "No", "No", "No", "No", "No", "Yes", "No", "No", "No", "No…
## $ HOME_VAL    <chr> "$0", "$257,252", "$124,191", "$306,251", "$243,925", "$0"…
## $ MSTATUS     <chr> "z_No", "z_No", "Yes", "Yes", "Yes", "z_No", "Yes", "Yes",…
## $ SEX         <chr> "M", "M", "z_F", "M", "z_F", "z_F", "z_F", "M", "z_F", "M"…
## $ EDUCATION   <chr> "PhD", "z_High School", "z_High School", "<High School", "…
## $ JOB         <chr> "Professional", "z_Blue Collar", "Clerical", "z_Blue Colla…
## $ TRAVTIME    <int> 14, 22, 5, 32, 36, 46, 33, 44, 34, 48, 15, 36, 25, 64, 48,…
## $ CAR_USE     <chr> "Private", "Commercial", "Private", "Private", "Private", …
## $ BLUEBOOK    <chr> "$14,230", "$14,940", "$4,010", "$15,440", "$18,000", "$17…
## $ TIF         <int> 11, 1, 4, 7, 1, 1, 1, 1, 1, 7, 1, 7, 7, 6, 1, 6, 6, 7, 4, …
## $ CAR_TYPE    <chr> "Minivan", "Minivan", "z_SUV", "Minivan", "z_SUV", "Sports…
## $ RED_CAR     <chr> "yes", "yes", "no", "yes", "no", "no", "no", "yes", "no", …
## $ OLDCLAIM    <chr> "$4,461", "$0", "$38,690", "$0", "$19,217", "$0", "$0", "$…
## $ CLM_FREQ    <int> 2, 0, 2, 0, 2, 0, 0, 1, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 2…
## $ REVOKED     <chr> "No", "No", "No", "No", "Yes", "No", "No", "Yes", "No", "N…
## $ MVR_PTS     <int> 3, 0, 3, 0, 3, 0, 0, 10, 0, 1, 0, 0, 3, 3, 3, 0, 0, 0, 0, …
## $ CAR_AGE     <int> 18, 1, 10, 6, 17, 7, 1, 7, 1, 17, 11, 1, 9, 10, 5, 13, 16,…
## $ URBANICITY  <chr> "Highly Urban/ Urban", "Highly Urban/ Urban", "Highly Urba…

DATA CLEANING

Fix formatting

remove_z <-  function(x){
  str_replace(x, 'z_', '')
}

# Remove extraneous z_
df_ins_raw <- mutate_all(df_ins_raw, funs(remove_z))
## Warning: `funs()` was deprecated in dplyr 0.8.0.
## Please use a list of either functions or lambdas: 
## 
##   # Simple named list: 
##   list(mean = mean, median = median)
## 
##   # Auto named with `tibble::lst()`: 
##   tibble::lst(mean, median)
## 
##   # Using lambdas
##   list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
remove_dollar <-  function(x){
  str_replace(x, '\\$', '')
}

# Remove dollar sign from variables
df_ins_raw <- mutate_all(df_ins_raw, funs(remove_dollar))

remove_comma <- function(x){
  str_replace(x, ',', '')
}

# Remove commas from variables
df_ins_raw <- mutate_all(df_ins_raw, funs(remove_comma))

# Preview updated data
glimpse(df_ins_raw)
## Rows: 8,161
## Columns: 25
## $ TARGET_FLAG <chr> "0", "0", "0", "0", "0", "1", "0", "1", "1", "0", "1", "0"…
## $ TARGET_AMT  <chr> "0", "0", "0", "0", "0", "2946", "0", "4021", "2501", "0",…
## $ KIDSDRIV    <chr> "0", "0", "0", "0", "0", "0", "0", "1", "0", "0", "0", "0"…
## $ AGE         <chr> "60", "43", "35", "51", "50", "34", "54", "37", "34", "50"…
## $ HOMEKIDS    <chr> "0", "0", "1", "0", "0", "1", "0", "2", "0", "0", "0", "0"…
## $ YOJ         <chr> "11", "11", "10", "14", NA, "12", NA, NA, "10", "7", "14",…
## $ INCOME      <chr> "67349", "91449", "16039", "", "114986", "125301", "18755"…
## $ PARENT1     <chr> "No", "No", "No", "No", "No", "Yes", "No", "No", "No", "No…
## $ HOME_VAL    <chr> "0", "257252", "124191", "306251", "243925", "0", "", "333…
## $ MSTATUS     <chr> "No", "No", "Yes", "Yes", "Yes", "No", "Yes", "Yes", "No",…
## $ SEX         <chr> "M", "M", "F", "M", "F", "F", "F", "M", "F", "M", "F", "F"…
## $ EDUCATION   <chr> "PhD", "High School", "High School", "<High School", "PhD"…
## $ JOB         <chr> "Professional", "Blue Collar", "Clerical", "Blue Collar", …
## $ TRAVTIME    <chr> "14", "22", "5", "32", "36", "46", "33", "44", "34", "48",…
## $ CAR_USE     <chr> "Private", "Commercial", "Private", "Private", "Private", …
## $ BLUEBOOK    <chr> "14230", "14940", "4010", "15440", "18000", "17430", "8780…
## $ TIF         <chr> "11", "1", "4", "7", "1", "1", "1", "1", "1", "7", "1", "7…
## $ CAR_TYPE    <chr> "Minivan", "Minivan", "SUV", "Minivan", "SUV", "Sports Car…
## $ RED_CAR     <chr> "yes", "yes", "no", "yes", "no", "no", "no", "yes", "no", …
## $ OLDCLAIM    <chr> "4461", "0", "38690", "0", "19217", "0", "0", "2374", "0",…
## $ CLM_FREQ    <chr> "2", "0", "2", "0", "2", "0", "0", "1", "0", "0", "0", "0"…
## $ REVOKED     <chr> "No", "No", "No", "No", "Yes", "No", "No", "Yes", "No", "N…
## $ MVR_PTS     <chr> "3", "0", "3", "0", "3", "0", "0", "10", "0", "1", "0", "0…
## $ CAR_AGE     <chr> "18", "1", "10", "6", "17", "7", "1", "7", "1", "17", "11"…
## $ URBANICITY  <chr> "Highly Urban/ Urban", "Highly Urban/ Urban", "Highly Urba…

Review distinct values

# Count of distinct values for each column
df_ins_raw %>% summarise_all(n_distinct)
##   TARGET_FLAG TARGET_AMT KIDSDRIV AGE HOMEKIDS YOJ INCOME PARENT1 HOME_VAL
## 1           2       1949        5  61        6  22   6613       2     5107
##   MSTATUS SEX EDUCATION JOB TRAVTIME CAR_USE BLUEBOOK TIF CAR_TYPE RED_CAR
## 1       2   2         5   9       97       2     2789  23        6       2
##   OLDCLAIM CLM_FREQ REVOKED MVR_PTS CAR_AGE URBANICITY
## 1     2857        6       2      13      31          2
df_ins_raw %>% distinct(PARENT1)
##   PARENT1
## 1      No
## 2     Yes
df_ins_raw %>% distinct(MSTATUS)
##   MSTATUS
## 1      No
## 2     Yes
df_ins_raw %>% distinct(SEX)
##   SEX
## 1   M
## 2   F
df_ins_raw %>% distinct(EDUCATION)
##      EDUCATION
## 1          PhD
## 2  High School
## 3 <High School
## 4    Bachelors
## 5      Masters
df_ins_raw %>% distinct(JOB)
##            JOB
## 1 Professional
## 2  Blue Collar
## 3     Clerical
## 4       Doctor
## 5       Lawyer
## 6      Manager
## 7             
## 8   Home Maker
## 9      Student
df_ins_raw %>% distinct(CAR_USE)
##      CAR_USE
## 1    Private
## 2 Commercial
df_ins_raw %>% distinct(CAR_TYPE)
##      CAR_TYPE
## 1     Minivan
## 2         SUV
## 3  Sports Car
## 4         Van
## 5 Panel Truck
## 6      Pickup
df_ins_raw %>% distinct(CLM_FREQ)
##   CLM_FREQ
## 1        2
## 2        0
## 3        1
## 4        3
## 5        5
## 6        4
df_ins_raw %>% distinct(REVOKED)
##   REVOKED
## 1      No
## 2     Yes
df_ins_raw %>% distinct(URBANICITY)
##            URBANICITY
## 1 Highly Urban/ Urban
## 2 Highly Rural/ Rural

Convert datatypes

# Set data types for variables
df_ins_clean <- df_ins_raw %>% transform( 
               TARGET_FLAG = as.factor(TARGET_FLAG), 
               TARGET_AMT = as.numeric(TARGET_AMT),
               KIDSDRIV = as.factor(KIDSDRIV),
               AGE = as.numeric(AGE),
               HOMEKIDS = as.factor(HOMEKIDS),
               YOJ = as.numeric(YOJ),
               INCOME = as.numeric(INCOME),
               PARENT1 = as.factor(PARENT1),
               HOME_VAL = as.numeric(HOME_VAL),
               MSTATUS = as.factor(MSTATUS),
               SEX = as.factor(SEX),
               EDUCATION = as.factor(EDUCATION),
               JOB = as.factor(JOB),
               TRAVTIME = as.numeric(TRAVTIME),
               CAR_USE = as.factor(CAR_USE),
               BLUEBOOK = as.numeric(BLUEBOOK),
               TIF = as.numeric(TIF), # factor or numeric?
               CAR_TYPE = as.factor(CAR_TYPE),
               RED_CAR = as.factor(RED_CAR),
               OLDCLAIM = as.numeric(OLDCLAIM),
               CLM_FREQ = as.ordered(CLM_FREQ),  # factor or numeric?
               REVOKED = as.factor(REVOKED),
               MVR_PTS = as.numeric(MVR_PTS),
               CAR_AGE = as.numeric(CAR_AGE),
               URBANICITY = as.factor(URBANICITY))

# Confirm CLM_FREQ is an ordered factor
is.ordered(df_ins_clean$CLM_FREQ)
## [1] TRUE

Review NAs

# NA counts for each column
colSums(is.na(df_ins_clean))
## TARGET_FLAG  TARGET_AMT    KIDSDRIV         AGE    HOMEKIDS         YOJ 
##           0           0           0           6           0         454 
##      INCOME     PARENT1    HOME_VAL     MSTATUS         SEX   EDUCATION 
##         445           0         464           0           0           0 
##         JOB    TRAVTIME     CAR_USE    BLUEBOOK         TIF    CAR_TYPE 
##           0           0           0           0           0           0 
##     RED_CAR    OLDCLAIM    CLM_FREQ     REVOKED     MVR_PTS     CAR_AGE 
##           0           0           0           0           0         510 
##  URBANICITY 
##           0
# Visualize NA counts for each column
df_ins_clean  %>%
  summarise_all(list(~is.na(.)))%>%
  pivot_longer(everything(),
               names_to = "variables", values_to="missing") %>%
  count(variables, missing) %>%
  ggplot(aes(y=variables,x=n,fill=missing))+
  geom_col()

Data imputation

# Impute data by regression: 
df_ins_imp <- mice(df_ins_clean, method = "norm.predict", m = 1, remove.collinear=FALSE)
## 
##  iter imp variable
##   1   1  AGE  YOJ  INCOME  HOME_VAL  CAR_AGE
##   2   1  AGE  YOJ  INCOME  HOME_VAL  CAR_AGE
##   3   1  AGE  YOJ  INCOME  HOME_VAL  CAR_AGE
##   4   1  AGE  YOJ  INCOME  HOME_VAL  CAR_AGE
##   5   1  AGE  YOJ  INCOME  HOME_VAL  CAR_AGE
df_ins_imp <- complete(df_ins_imp)

# Confirm no NAs remain
colSums(is.na(df_ins_imp))
## TARGET_FLAG  TARGET_AMT    KIDSDRIV         AGE    HOMEKIDS         YOJ 
##           0           0           0           0           0           0 
##      INCOME     PARENT1    HOME_VAL     MSTATUS         SEX   EDUCATION 
##           0           0           0           0           0           0 
##         JOB    TRAVTIME     CAR_USE    BLUEBOOK         TIF    CAR_TYPE 
##           0           0           0           0           0           0 
##     RED_CAR    OLDCLAIM    CLM_FREQ     REVOKED     MVR_PTS     CAR_AGE 
##           0           0           0           0           0           0 
##  URBANICITY 
##           0

DATA EXPLORATION

Summary statistics

describe(df_ins_imp)
## df_ins_imp 
## 
##  25  Variables      8161  Observations
## --------------------------------------------------------------------------------
## TARGET_FLAG 
##        n  missing distinct 
##     8161        0        2 
##                       
## Value          0     1
## Frequency   6008  2153
## Proportion 0.736 0.264
## --------------------------------------------------------------------------------
## TARGET_AMT 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     8161        0     1949    0.601     1504     2574        0        0 
##      .25      .50      .75      .90      .95 
##        0        0     1036     4904     6452 
## 
## lowest :      0.00000     30.27728     58.53106     95.56732    108.74150
## highest:  73783.46592  77907.43028  78874.19056  85523.65335 107586.13616
## --------------------------------------------------------------------------------
## KIDSDRIV 
##        n  missing distinct 
##     8161        0        5 
## 
## lowest : 0 1 2 3 4, highest: 0 1 2 3 4
##                                         
## Value          0     1     2     3     4
## Frequency   7180   636   279    62     4
## Proportion 0.880 0.078 0.034 0.008 0.000
## --------------------------------------------------------------------------------
## AGE 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     8161        0       66    0.999    44.78    9.749       30       34 
##      .25      .50      .75      .90      .95 
##       39       45       51       56       59 
## 
## lowest : 16 17 18 19 20, highest: 72 73 76 80 81
## --------------------------------------------------------------------------------
## HOMEKIDS 
##        n  missing distinct 
##     8161        0        6 
## 
## lowest : 0 1 2 3 4, highest: 1 2 3 4 5
##                                               
## Value          0     1     2     3     4     5
## Frequency   5289   902  1118   674   164    14
## Proportion 0.648 0.111 0.137 0.083 0.020 0.002
## --------------------------------------------------------------------------------
## YOJ 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     8161        0      475    0.991     10.5      4.2        0        5 
##      .25      .50      .75      .90      .95 
##        9       11       13       14       15 
## 
## lowest :  0.000000  1.000000  2.000000  2.748179  3.000000
## highest: 16.348246 17.000000 18.000000 19.000000 23.000000
## --------------------------------------------------------------------------------
## INCOME 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     8161        0     7057        1    61595    51009        0     4362 
##      .25      .50      .75      .90      .95 
##    27940    54216    85472   122744   151663 
## 
## lowest : -31968.52 -26991.28 -20478.35 -16829.40 -16713.97
## highest: 306277.00 309628.00 320127.00 332339.00 367030.00
## --------------------------------------------------------------------------------
## PARENT1 
##        n  missing distinct 
##     8161        0        2 
##                       
## Value         No   Yes
## Frequency   7084  1077
## Proportion 0.868 0.132
## --------------------------------------------------------------------------------
## HOME_VAL 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     8161        0     5570    0.978   154934   142397        0        0 
##      .25      .50      .75      .90      .95 
##        0   160953   237604   314151   373031 
## 
## lowest : -86662.47 -71570.86 -71165.20 -70648.14 -68150.79
## highest: 657804.00 682634.00 738153.00 750455.00 885282.00
## --------------------------------------------------------------------------------
## MSTATUS 
##        n  missing distinct 
##     8161        0        2 
##                     
## Value        No  Yes
## Frequency  3267 4894
## Proportion  0.4  0.6
## --------------------------------------------------------------------------------
## SEX 
##        n  missing distinct 
##     8161        0        2 
##                       
## Value          F     M
## Frequency   4375  3786
## Proportion 0.536 0.464
## --------------------------------------------------------------------------------
## EDUCATION 
##        n  missing distinct 
##     8161        0        5 
## 
## lowest : <High School Bachelors    High School  Masters      PhD         
## highest: <High School Bachelors    High School  Masters      PhD         
##                                                                            
## Value      <High School    Bachelors  High School      Masters          PhD
## Frequency          1203         2242         2330         1658          728
## Proportion        0.147        0.275        0.286        0.203        0.089
## --------------------------------------------------------------------------------
## JOB 
##        n  missing distinct 
##     8161        0        9 
## 
## lowest :              Blue Collar  Clerical     Doctor       Home Maker  
## highest: Home Maker   Lawyer       Manager      Professional Student     
##                                                                            
## Value                    Blue Collar     Clerical       Doctor   Home Maker
## Frequency           526         1825         1271          246          641
## Proportion        0.064        0.224        0.156        0.030        0.079
##                                                               
## Value            Lawyer      Manager Professional      Student
## Frequency           835          988         1117          712
## Proportion        0.102        0.121        0.137        0.087
## --------------------------------------------------------------------------------
## TRAVTIME 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     8161        0       97        1    33.49    17.85        7       13 
##      .25      .50      .75      .90      .95 
##       22       33       44       54       60 
## 
## lowest :   5   6   7   8   9, highest: 103 113 124 134 142
## --------------------------------------------------------------------------------
## CAR_USE 
##        n  missing distinct 
##     8161        0        2 
##                                 
## Value      Commercial    Private
## Frequency        3029       5132
## Proportion      0.371      0.629
## --------------------------------------------------------------------------------
## BLUEBOOK 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     8161        0     2789        1    15710     9354     4900     6000 
##      .25      .50      .75      .90      .95 
##     9280    14440    20850    27460    31110 
## 
## lowest :  1500  1520  1530  1540  1590, highest: 57970 61050 62240 65970 69740
## --------------------------------------------------------------------------------
## TIF 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     8161        0       23    0.961    5.351    4.512        1        1 
##      .25      .50      .75      .90      .95 
##        1        4        7       11       13 
## 
## lowest :  1  2  3  4  5, highest: 19 20 21 22 25
## --------------------------------------------------------------------------------
## CAR_TYPE 
##        n  missing distinct 
##     8161        0        6 
## 
## lowest : Minivan     Panel Truck Pickup      Sports Car  SUV        
## highest: Panel Truck Pickup      Sports Car  SUV         Van        
##                                                                       
## Value          Minivan Panel Truck      Pickup  Sports Car         SUV
## Frequency         2145         676        1389         907        2294
## Proportion       0.263       0.083       0.170       0.111       0.281
##                       
## Value              Van
## Frequency          750
## Proportion       0.092
## --------------------------------------------------------------------------------
## RED_CAR 
##        n  missing distinct 
##     8161        0        2 
##                       
## Value         no   yes
## Frequency   5783  2378
## Proportion 0.709 0.291
## --------------------------------------------------------------------------------
## OLDCLAIM 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     8161        0     2857    0.769     4037     6563        0        0 
##      .25      .50      .75      .90      .95 
##        0        0     4636     9583    27090 
## 
## lowest :     0   502   506   518   519, highest: 52507 53477 53568 53986 57037
## --------------------------------------------------------------------------------
## CLM_FREQ 
##        n  missing distinct 
##     8161        0        6 
## 
## lowest : 0 1 2 3 4, highest: 1 2 3 4 5
##                                               
## Value          0     1     2     3     4     5
## Frequency   5009   997  1171   776   190    18
## Proportion 0.614 0.122 0.143 0.095 0.023 0.002
## --------------------------------------------------------------------------------
## REVOKED 
##        n  missing distinct 
##     8161        0        2 
##                       
## Value         No   Yes
## Frequency   7161  1000
## Proportion 0.877 0.123
## --------------------------------------------------------------------------------
## MVR_PTS 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     8161        0       13      0.9    1.696    2.187        0        0 
##      .25      .50      .75      .90      .95 
##        0        1        3        5        6 
## 
## lowest :  0  1  2  3  4, highest:  8  9 10 11 13
##                                                                             
## Value          0     1     2     3     4     5     6     7     8     9    10
## Frequency   3712  1157   948   758   599   399   266   167    84    45    13
## Proportion 0.455 0.142 0.116 0.093 0.073 0.049 0.033 0.020 0.010 0.006 0.002
##                       
## Value         11    13
## Frequency     11     2
## Proportion 0.001 0.000
## --------------------------------------------------------------------------------
## CAR_AGE 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     8161        0      540    0.985    8.347    6.374    1.000    1.000 
##      .25      .50      .75      .90      .95 
##    3.514    8.000   12.000   16.000   18.000 
## 
## lowest : -3.0000000 -1.1184462 -0.9423218  0.0000000  1.0000000
## highest: 24.0000000 25.0000000 26.0000000 27.0000000 28.0000000
## --------------------------------------------------------------------------------
## URBANICITY 
##        n  missing distinct 
##     8161        0        2 
##                                                   
## Value      Highly Rural/ Rural Highly Urban/ Urban
## Frequency                 1669                6492
## Proportion               0.205               0.795
## --------------------------------------------------------------------------------

Distributions of variables

# Histograms
df_ins_imp %>%
  keep(is.numeric) %>% 
  gather() %>%
  ggplot(aes(value)) +
    facet_wrap(~ key, scales = "free") +
    geom_density(fill = "steelblue", alpha=0.9, color="steelblue") +
    geom_histogram(aes(y=..density..), alpha=0.5, fill = "lightblue", color="lightblue", position="identity")

# Boxplots
df_ins_imp %>%
  keep(is.numeric) %>% 
  gather() %>%
  ggplot(aes(value)) +
  facet_wrap(~ key, scales = "free") +
  geom_boxplot(fill = "steelblue", color="black", outlier.colour="red", outlier.shape=16,
             outlier.size=2, notch=FALSE)

Distributions of log-transformed variables

# Log transformation
df_ins_imp_log <- df_ins_imp %>% keep(is.numeric)
df_ins_imp_log <- log(df_ins_imp_log + 1)

# Histograms of log transformed numeric variables
df_ins_imp_log %>%
  gather(variable, value, TARGET_AMT:CAR_AGE) %>%
  ggplot(., aes(value)) + 
  geom_density(fill = "steelblue", color="steelblue") + 
  facet_wrap(~variable, scales ="free", ncol = 4) +
  labs(x = element_blank(), y = element_blank())

# Test for normality
shapiro.test(df_ins_imp_log$AGE[0:5000])
## 
##  Shapiro-Wilk normality test
## 
## data:  df_ins_imp_log$AGE[0:5000]
## W = 0.97949, p-value < 2.2e-16
# Visual inspection of one variable (age) for normality
qqnorm(df_ins_imp_log$AGE, pch = 1, frame = FALSE)
qqline(df_ins_imp_log$AGE, col = "steelblue", lwd = 2)

Distributions of square root-transformed variables

# Square root transformation
df_ins_imp_sqrt <- sqrt(df_ins_imp %>% keep(is.numeric))

# Histograms of square root transformed numeric variables
df_ins_imp_sqrt %>%
  gather(variable, value, TARGET_AMT:CAR_AGE) %>%
  ggplot(., aes(value)) + 
  geom_density(fill = "steelblue", color="steelblue") + 
  facet_wrap(~variable, scales ="free", ncol = 4) +
  labs(x = element_blank(), y = element_blank())

# Test for normality
shapiro.test(df_ins_imp_sqrt$AGE[0:5000])
## 
##  Shapiro-Wilk normality test
## 
## data:  df_ins_imp_sqrt$AGE[0:5000]
## W = 0.99371, p-value = 5.047e-14
# Visual inspection of one variable (age) for normality
qqnorm(df_ins_imp_sqrt$AGE, pch = 1, frame = FALSE)
qqline(df_ins_imp_sqrt$AGE, col = "steelblue", lwd = 2)

Distributions of cube root-transformed variables

# Cube root transformation
df_ins_imp_cube <- (df_ins_imp %>% keep(is.numeric))^(1/3)

# Histograms of cube root transformed numeric variables
df_ins_imp_cube %>%
  gather(variable, value, TARGET_AMT:CAR_AGE) %>%
  ggplot(., aes(value)) + 
  geom_density(fill = "steelblue", color="steelblue") + 
  facet_wrap(~variable, scales ="free", ncol = 4) +
  labs(x = element_blank(), y = element_blank())

# Test for normality
shapiro.test(df_ins_imp_cube$AGE[0:5000])
## 
##  Shapiro-Wilk normality test
## 
## data:  df_ins_imp_cube$AGE[0:5000]
## W = 0.98989, p-value < 2.2e-16
# Visual inspection of one variable (age) for normality
qqnorm(df_ins_imp_cube$AGE, pch = 1, frame = FALSE)
qqline(df_ins_imp_cube$AGE, col = "steelblue", lwd = 2)

Create new columns with transformed variables

df_ins <- df_ins_imp %>%
                  mutate(across(c(TARGET_AMT, AGE, YOJ, INCOME, HOME_VAL, TRAVTIME, BLUEBOOK, 
                                  TIF, OLDCLAIM, MVR_PTS, CAR_AGE), .fns = list(log = ~ log(. + 1))))

df_ins <- df_ins %>%
                  mutate(across(c(TARGET_AMT, AGE, YOJ, INCOME, HOME_VAL, TRAVTIME, BLUEBOOK, 
                                  TIF, OLDCLAIM, MVR_PTS, CAR_AGE), .fns = list(sqrt = sqrt)))

df_ins <- df_ins %>%
                  mutate(across(c(TARGET_AMT, AGE, YOJ, INCOME, HOME_VAL, TRAVTIME, BLUEBOOK, 
                                  TIF, OLDCLAIM, MVR_PTS, CAR_AGE), .fns = list(cbrt = ~ .^(1/3))))

glimpse(df_ins)
## Rows: 8,161
## Columns: 58
## $ TARGET_FLAG     <fct> 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, …
## $ TARGET_AMT      <dbl> 0.000, 0.000, 0.000, 0.000, 0.000, 2946.000, 0.000, 40…
## $ KIDSDRIV        <fct> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ AGE             <dbl> 60, 43, 35, 51, 50, 34, 54, 37, 34, 50, 53, 43, 55, 53…
## $ HOMEKIDS        <fct> 0, 0, 1, 0, 0, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 3, 0, 3, …
## $ YOJ             <dbl> 11.00000, 11.00000, 10.00000, 14.00000, 11.34200, 12.0…
## $ INCOME          <dbl> 67349.00, 91449.00, 16039.00, 67185.86, 114986.00, 125…
## $ PARENT1         <fct> No, No, No, No, No, Yes, No, No, No, No, No, No, No, N…
## $ HOME_VAL        <dbl> 0.0, 257252.0, 124191.0, 306251.0, 243925.0, 0.0, 1654…
## $ MSTATUS         <fct> No, No, Yes, Yes, Yes, No, Yes, Yes, No, No, No, Yes, …
## $ SEX             <fct> M, M, F, M, F, F, F, M, F, M, F, F, M, M, F, F, M, F, …
## $ EDUCATION       <fct> PhD, High School, High School, <High School, PhD, Bach…
## $ JOB             <fct> Professional, Blue Collar, Clerical, Blue Collar, Doct…
## $ TRAVTIME        <dbl> 14, 22, 5, 32, 36, 46, 33, 44, 34, 48, 15, 36, 25, 64,…
## $ CAR_USE         <fct> Private, Commercial, Private, Private, Private, Commer…
## $ BLUEBOOK        <dbl> 14230, 14940, 4010, 15440, 18000, 17430, 8780, 16970, …
## $ TIF             <dbl> 11, 1, 4, 7, 1, 1, 1, 1, 1, 7, 1, 7, 7, 6, 1, 6, 6, 7,…
## $ CAR_TYPE        <fct> Minivan, Minivan, SUV, Minivan, SUV, Sports Car, SUV, …
## $ RED_CAR         <fct> yes, yes, no, yes, no, no, no, yes, no, no, no, no, ye…
## $ OLDCLAIM        <dbl> 4461, 0, 38690, 0, 19217, 0, 0, 2374, 0, 0, 0, 0, 5028…
## $ CLM_FREQ        <ord> 2, 0, 2, 0, 2, 0, 0, 1, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, …
## $ REVOKED         <fct> No, No, No, No, Yes, No, No, Yes, No, No, No, No, Yes,…
## $ MVR_PTS         <dbl> 3, 0, 3, 0, 3, 0, 0, 10, 0, 1, 0, 0, 3, 3, 3, 0, 0, 0,…
## $ CAR_AGE         <dbl> 18, 1, 10, 6, 17, 7, 1, 7, 1, 17, 11, 1, 9, 10, 5, 13,…
## $ URBANICITY      <fct> Highly Urban/ Urban, Highly Urban/ Urban, Highly Urban…
## $ TARGET_AMT_log  <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 7.98…
## $ AGE_log         <dbl> 4.110874, 3.784190, 3.583519, 3.951244, 3.931826, 3.55…
## $ YOJ_log         <dbl> 2.484907, 2.484907, 2.397895, 2.708050, 2.513008, 2.56…
## $ INCOME_log      <dbl> 11.117658, 11.423548, 9.682841, 11.115233, 11.652574, …
## $ HOME_VAL_log    <dbl> 0.00000, 12.45782, 11.72958, 12.63216, 12.40462, 0.000…
## $ TRAVTIME_log    <dbl> 2.708050, 3.135494, 1.791759, 3.496508, 3.610918, 3.85…
## $ BLUEBOOK_log    <dbl> 9.563178, 9.611864, 8.296796, 9.644782, 9.798183, 9.76…
## $ TIF_log         <dbl> 2.4849066, 0.6931472, 1.6094379, 2.0794415, 0.6931472,…
## $ OLDCLAIM_log    <dbl> 8.403352, 0.000000, 10.563362, 0.000000, 9.863603, 0.0…
## $ MVR_PTS_log     <dbl> 1.3862944, 0.0000000, 1.3862944, 0.0000000, 1.3862944,…
## $ CAR_AGE_log     <dbl> 2.9444390, 0.6931472, 2.3978953, 1.9459101, 2.8903718,…
## $ TARGET_AMT_sqrt <dbl> 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 54.27707,…
## $ AGE_sqrt        <dbl> 7.745967, 6.557439, 5.916080, 7.141428, 7.071068, 5.83…
## $ YOJ_sqrt        <dbl> 3.316625, 3.316625, 3.162278, 3.741657, 3.367789, 3.46…
## $ INCOME_sqrt     <dbl> 259.5169, 302.4054, 126.6452, 259.2024, 339.0959, 353.…
## $ HOME_VAL_sqrt   <dbl> 0.0000, 507.2002, 352.4074, 553.3995, 493.8876, 0.0000…
## $ TRAVTIME_sqrt   <dbl> 3.741657, 4.690416, 2.236068, 5.656854, 6.000000, 6.78…
## $ BLUEBOOK_sqrt   <dbl> 119.28956, 122.22929, 63.32456, 124.25780, 134.16408, …
## $ TIF_sqrt        <dbl> 3.316625, 1.000000, 2.000000, 2.645751, 1.000000, 1.00…
## $ OLDCLAIM_sqrt   <dbl> 66.79072, 0.00000, 196.69774, 0.00000, 138.62539, 0.00…
## $ MVR_PTS_sqrt    <dbl> 1.732051, 0.000000, 1.732051, 0.000000, 1.732051, 0.00…
## $ CAR_AGE_sqrt    <dbl> 4.242641, 1.000000, 3.162278, 2.449490, 4.123106, 2.64…
## $ TARGET_AMT_cbrt <dbl> 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 14.33544,…
## $ AGE_cbrt        <dbl> 3.914868, 3.503398, 3.271066, 3.708430, 3.684031, 3.23…
## $ YOJ_cbrt        <dbl> 2.223980, 2.223980, 2.154435, 2.410142, 2.246794, 2.28…
## $ INCOME_cbrt     <dbl> 40.68588, 45.05327, 25.21888, 40.65300, 48.62747, 50.0…
## $ HOME_VAL_cbrt   <dbl> 0.00000, 63.59939, 49.89190, 67.40506, 62.48159, 0.000…
## $ TRAVTIME_cbrt   <dbl> 2.410142, 2.802039, 1.709976, 3.174802, 3.301927, 3.58…
## $ BLUEBOOK_cbrt   <dbl> 24.23269, 24.62919, 15.88723, 24.90094, 26.20741, 25.9…
## $ TIF_cbrt        <dbl> 2.223980, 1.000000, 1.587401, 1.912931, 1.000000, 1.00…
## $ OLDCLAIM_cbrt   <dbl> 16.46180, 0.00000, 33.82202, 0.00000, 26.78522, 0.0000…
## $ MVR_PTS_cbrt    <dbl> 1.442250, 0.000000, 1.442250, 0.000000, 1.442250, 0.00…
## $ CAR_AGE_cbrt    <dbl> 2.620741, 1.000000, 2.154435, 1.817121, 2.571282, 1.91…

Create new variables

df_ins$HV_INC_RATIO <- df_ins$HOME_VAL / df_ins$INCOME

df_ins$TRT_MVR_PRODUCT <- df_ins$TRAVTIME * df_ins$MVR_PTS

df_ins$HV_INC_RATIO[is.nan(df_ins$HV_INC_RATIO)] <- 0
df_ins$HV_INC_RATIO[is.infinite(df_ins$HV_INC_RATIO)] <- 0

Data imputation again

# Impute data by regression: 
df_ins <- mice(df_ins, method = "norm.predict", m = 1, remove.collinear=FALSE)
## 
##  iter imp variable
##   1   1  INCOME_log*  HOME_VAL_log*  CAR_AGE_log*  INCOME_sqrt*  HOME_VAL_sqrt*  CAR_AGE_sqrt*  INCOME_cbrt*  HOME_VAL_cbrt*  CAR_AGE_cbrt*
##   2   1  INCOME_log  HOME_VAL_log*  CAR_AGE_log*  INCOME_sqrt  HOME_VAL_sqrt*  CAR_AGE_sqrt*  INCOME_cbrt*  HOME_VAL_cbrt*  CAR_AGE_cbrt*
##   3   1  INCOME_log*  HOME_VAL_log*  CAR_AGE_log*  INCOME_sqrt  HOME_VAL_sqrt*  CAR_AGE_sqrt*  INCOME_cbrt*  HOME_VAL_cbrt*  CAR_AGE_cbrt*
##   4   1  INCOME_log*  HOME_VAL_log*  CAR_AGE_log*  INCOME_sqrt  HOME_VAL_sqrt*  CAR_AGE_sqrt*  INCOME_cbrt*  HOME_VAL_cbrt*  CAR_AGE_cbrt*
##   5   1  INCOME_log*  HOME_VAL_log*  CAR_AGE_log*  INCOME_sqrt  HOME_VAL_sqrt*  CAR_AGE_sqrt*  INCOME_cbrt*  HOME_VAL_cbrt*  CAR_AGE_cbrt*
## Warning: Number of logged events: 85
df_ins <- complete(df_ins)

# Confirm no NAs remain
colSums(is.na(df_ins))
##     TARGET_FLAG      TARGET_AMT        KIDSDRIV             AGE        HOMEKIDS 
##               0               0               0               0               0 
##             YOJ          INCOME         PARENT1        HOME_VAL         MSTATUS 
##               0               0               0               0               0 
##             SEX       EDUCATION             JOB        TRAVTIME         CAR_USE 
##               0               0               0               0               0 
##        BLUEBOOK             TIF        CAR_TYPE         RED_CAR        OLDCLAIM 
##               0               0               0               0               0 
##        CLM_FREQ         REVOKED         MVR_PTS         CAR_AGE      URBANICITY 
##               0               0               0               0               0 
##  TARGET_AMT_log         AGE_log         YOJ_log      INCOME_log    HOME_VAL_log 
##               0               0               0               0               0 
##    TRAVTIME_log    BLUEBOOK_log         TIF_log    OLDCLAIM_log     MVR_PTS_log 
##               0               0               0               0               0 
##     CAR_AGE_log TARGET_AMT_sqrt        AGE_sqrt        YOJ_sqrt     INCOME_sqrt 
##               0               0               0               0               0 
##   HOME_VAL_sqrt   TRAVTIME_sqrt   BLUEBOOK_sqrt        TIF_sqrt   OLDCLAIM_sqrt 
##               0               0               0               0               0 
##    MVR_PTS_sqrt    CAR_AGE_sqrt TARGET_AMT_cbrt        AGE_cbrt        YOJ_cbrt 
##               0               0               0               0               0 
##     INCOME_cbrt   HOME_VAL_cbrt   TRAVTIME_cbrt   BLUEBOOK_cbrt        TIF_cbrt 
##               0               0               0               0               0 
##   OLDCLAIM_cbrt    MVR_PTS_cbrt    CAR_AGE_cbrt    HV_INC_RATIO TRT_MVR_PRODUCT 
##               0               0               0               0               0

Correlation of variables

# Visualize correlation between variables
corrplot(cor(df_ins_imp %>% keep(is.numeric)), method="shade", shade.col=NA, tl.col="black", tl.srt=45)

# Reshape correlation results
flattenCorrMatrix <- function(cormat, pmat) {
  ut <- upper.tri(cormat)
  data.frame(
    row = rownames(cormat)[row(cormat)[ut]],
    column = rownames(cormat)[col(cormat)[ut]],
    cor  =(cormat)[ut],
    p = pmat[ut]
    )
}

# Closer look at correlations of variables
corr_results <- rcorr(as.matrix(df_ins_imp %>% keep(is.numeric)))
df_corr <- flattenCorrMatrix(corr_results$r, corr_results$P)

# Noteworthy positive correlations
df_corr %>% filter(cor > 0.4)
##      row   column       cor p
## 1 INCOME HOME_VAL 0.5905965 0
## 2 INCOME BLUEBOOK 0.4347123 0
## 3 INCOME  CAR_AGE 0.4259208 0
# Noteworthy negative correlations
df_corr %>% filter(cor < -0.4)
## [1] row    column cor    p     
## <0 rows> (or 0-length row.names)
# Pair plot
pairs(df_ins_imp %>% keep(is.numeric), lower.panel = NULL, col = "steelblue")

Check for multicollinearity

model <- glm(TARGET_AMT ~ KIDSDRIV + AGE + YOJ + INCOME + HOME_VAL + TRAVTIME + BLUEBOOK +
            TIF + OLDCLAIM + MVR_PTS + CAR_AGE, data = df_ins, family = "quasipoisson")

vif(model)
##              GVIF Df GVIF^(1/(2*Df))
## KIDSDRIV 1.024923  4        1.003082
## AGE      1.094997  1        1.046421
## YOJ      1.182086  1        1.087238
## INCOME   1.821524  1        1.349639
## HOME_VAL 1.330630  1        1.153529
## TRAVTIME 1.004598  1        1.002296
## BLUEBOOK 1.229378  1        1.108773
## TIF      1.004250  1        1.002123
## OLDCLAIM 1.080919  1        1.039673
## MVR_PTS  1.097636  1        1.047681
## CAR_AGE  1.252012  1        1.118933

Bucket select variables (by quantiles)

# 0-.25, .25-.75, .75-1
df_ins$CAR_AGE_fact <- cut(x = df_ins$CAR_AGE, breaks = c(-4, 3.5, 12, 28), labels = c("New", "Moderate", "Old"))

# -.5, .5-.9, .9+
df_ins$HOME_VAL_fact <- cut(x = df_ins$HOME_VAL, breaks = c(-86567, 160953, 314151, 885283), labels = c("No or Low", "Moderate", "High"))

# 0-.25, .25-.75, .75-1
df_ins$INCOME_fact <- cut(x = df_ins$INCOME, breaks = c(-31969, 27940, 85472, 367031), labels = c("Low", "Moderate", "High"))

# 0-.5, .50-.75, .75-1
df_ins$MVR_PTS_fact <- cut(x = df_ins$MVR_PTS, breaks = c(-1, 1, 3, 14), labels = c("Low", "Moderate", "High"))

# 0-.75, .75-1
df_ins$OLDCLAIM_fact <- cut(x = df_ins$OLDCLAIM, breaks = c(-1, 4636, 57038), labels = c("Low", "High"))

# 0-.25, .25-.75, .75-1
df_ins$TIF_fact <- cut(x = df_ins$TIF, breaks = c(-1, 1, 7, 26), labels = c("Low", "Moderate", "High"))

# 0-.25, .25-.75, .75-1
df_ins$TRAVTIME_fact <- cut(x = df_ins$TRAVTIME, breaks = c(4, 22, 44, 143), labels = c("Short", "Moderate", "Long"))

# 0-.25, .25-.75, .75-1
df_ins$YOJ_fact <- cut(x = df_ins$YOJ, breaks = c(-1, 9, 13, 24), labels = c("Low", "Moderate", "High"))

Binary Logistic Regression

Binomial NULL Model

# Remove the Target Columns and create new data frame

data_train <- df_ins %>% dplyr::select(-starts_with("TARGET"))
data_train$TARGET_FLAG <- df_ins$TARGET_FLAG
# Show new data frame
head(data_train)
##   KIDSDRIV AGE HOMEKIDS    YOJ    INCOME PARENT1 HOME_VAL MSTATUS SEX
## 1        0  60        0 11.000  67349.00      No        0      No   M
## 2        0  43        0 11.000  91449.00      No   257252      No   M
## 3        0  35        1 10.000  16039.00      No   124191     Yes   F
## 4        0  51        0 14.000  67185.86      No   306251     Yes   M
## 5        0  50        0 11.342 114986.00      No   243925     Yes   F
## 6        0  34        1 12.000 125301.00     Yes        0      No   F
##      EDUCATION          JOB TRAVTIME    CAR_USE BLUEBOOK TIF   CAR_TYPE RED_CAR
## 1          PhD Professional       14    Private    14230  11    Minivan     yes
## 2  High School  Blue Collar       22 Commercial    14940   1    Minivan     yes
## 3  High School     Clerical        5    Private     4010   4        SUV      no
## 4 <High School  Blue Collar       32    Private    15440   7    Minivan     yes
## 5          PhD       Doctor       36    Private    18000   1        SUV      no
## 6    Bachelors  Blue Collar       46 Commercial    17430   1 Sports Car      no
##   OLDCLAIM CLM_FREQ REVOKED MVR_PTS CAR_AGE          URBANICITY  AGE_log
## 1     4461        2      No       3      18 Highly Urban/ Urban 4.110874
## 2        0        0      No       0       1 Highly Urban/ Urban 3.784190
## 3    38690        2      No       3      10 Highly Urban/ Urban 3.583519
## 4        0        0      No       0       6 Highly Urban/ Urban 3.951244
## 5    19217        2     Yes       3      17 Highly Urban/ Urban 3.931826
## 6        0        0      No       0       7 Highly Urban/ Urban 3.555348
##    YOJ_log INCOME_log HOME_VAL_log TRAVTIME_log BLUEBOOK_log   TIF_log
## 1 2.484907  11.117658      0.00000     2.708050     9.563178 2.4849066
## 2 2.484907  11.423548     12.45782     3.135494     9.611864 0.6931472
## 3 2.397895   9.682841     11.72958     1.791759     8.296796 1.6094379
## 4 2.708050  11.115233     12.63216     3.496508     9.644782 2.0794415
## 5 2.513008  11.652574     12.40462     3.610918     9.798183 0.6931472
## 6 2.564949  11.738482      0.00000     3.850148     9.766006 0.6931472
##   OLDCLAIM_log MVR_PTS_log CAR_AGE_log AGE_sqrt YOJ_sqrt INCOME_sqrt
## 1     8.403352    1.386294   2.9444390 7.745967 3.316625    259.5169
## 2     0.000000    0.000000   0.6931472 6.557439 3.316625    302.4054
## 3    10.563362    1.386294   2.3978953 5.916080 3.162278    126.6452
## 4     0.000000    0.000000   1.9459101 7.141428 3.741657    259.2024
## 5     9.863603    1.386294   2.8903718 7.071068 3.367789    339.0959
## 6     0.000000    0.000000   2.0794415 5.830952 3.464102    353.9788
##   HOME_VAL_sqrt TRAVTIME_sqrt BLUEBOOK_sqrt TIF_sqrt OLDCLAIM_sqrt MVR_PTS_sqrt
## 1        0.0000      3.741657     119.28956 3.316625      66.79072     1.732051
## 2      507.2002      4.690416     122.22929 1.000000       0.00000     0.000000
## 3      352.4074      2.236068      63.32456 2.000000     196.69774     1.732051
## 4      553.3995      5.656854     124.25780 2.645751       0.00000     0.000000
## 5      493.8876      6.000000     134.16408 1.000000     138.62539     1.732051
## 6        0.0000      6.782330     132.02273 1.000000       0.00000     0.000000
##   CAR_AGE_sqrt AGE_cbrt YOJ_cbrt INCOME_cbrt HOME_VAL_cbrt TRAVTIME_cbrt
## 1     4.242641 3.914868 2.223980    40.68588       0.00000      2.410142
## 2     1.000000 3.503398 2.223980    45.05327      63.59939      2.802039
## 3     3.162278 3.271066 2.154435    25.21888      49.89190      1.709976
## 4     2.449490 3.708430 2.410142    40.65300      67.40506      3.174802
## 5     4.123106 3.684031 2.246794    48.62747      62.48159      3.301927
## 6     2.645751 3.239612 2.289428    50.04010       0.00000      3.583048
##   BLUEBOOK_cbrt TIF_cbrt OLDCLAIM_cbrt MVR_PTS_cbrt CAR_AGE_cbrt HV_INC_RATIO
## 1      24.23269 2.223980      16.46180      1.44225     2.620741     0.000000
## 2      24.62919 1.000000       0.00000      0.00000     1.000000     2.813065
## 3      15.88723 1.587401      33.82202      1.44225     2.154435     7.743064
## 4      24.90094 1.912931       0.00000      0.00000     1.817121     4.558265
## 5      26.20741 1.000000      26.78522      1.44225     2.571282     2.121345
## 6      25.92781 1.000000       0.00000      0.00000     1.912931     0.000000
##   TRT_MVR_PRODUCT CAR_AGE_fact HOME_VAL_fact INCOME_fact MVR_PTS_fact
## 1              42          Old     No or Low    Moderate     Moderate
## 2               0          New      Moderate        High          Low
## 3              15     Moderate     No or Low         Low     Moderate
## 4               0     Moderate      Moderate    Moderate          Low
## 5             108          Old      Moderate        High     Moderate
## 6               0     Moderate     No or Low        High          Low
##   OLDCLAIM_fact TIF_fact TRAVTIME_fact YOJ_fact TARGET_FLAG
## 1           Low     High         Short Moderate           0
## 2           Low      Low         Short Moderate           0
## 3          High Moderate         Short Moderate           0
## 4           Low Moderate      Moderate     High           0
## 5          High      Low      Moderate Moderate           0
## 6           Low      Low          Long Moderate           1

Build a Binary Logistic Regression Null model utilizing all the variables and data

# Build the binomial null regression
model1 <- glm(TARGET_FLAG ~ 1, data = data_train, family = binomial(link ="logit"))
summary(model1)
## 
## Call:
## glm(formula = TARGET_FLAG ~ 1, family = binomial(link = "logit"), 
##     data = data_train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.7827  -0.7827  -0.7827   1.6325   1.6325  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.02623    0.02512  -40.86   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 9418  on 8160  degrees of freedom
## Residual deviance: 9418  on 8160  degrees of freedom
## AIC: 9420
## 
## Number of Fisher Scoring iterations: 4

Binomial FULL Model

Binary Logistic Regression Full model utilizing all the variables and data, This model will be considered to be valid..

#  Binomial FULL Model
model2 <- glm(TARGET_FLAG ~ ., data = data_train, family = binomial(link ="logit"))
summary(model2)
## 
## Call:
## glm(formula = TARGET_FLAG ~ ., family = binomial(link = "logit"), 
##     data = data_train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.7570  -0.6937  -0.3802   0.5595   3.0348  
## 
## Coefficients:
##                                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                    2.644e+03  1.674e+03   1.580 0.114153    
## KIDSDRIV1                      7.110e-01  1.213e-01   5.861 4.60e-09 ***
## KIDSDRIV2                      9.772e-01  1.694e-01   5.768 8.03e-09 ***
## KIDSDRIV3                      1.202e+00  3.188e-01   3.770 0.000164 ***
## KIDSDRIV4                     -1.375e+01  3.050e+02  -0.045 0.964048    
## AGE                           -1.146e+01  8.468e+00  -1.354 0.175888    
## HOMEKIDS1                      1.227e-01  1.259e-01   0.975 0.329726    
## HOMEKIDS2                      6.168e-02  1.251e-01   0.493 0.622067    
## HOMEKIDS3                      2.502e-03  1.466e-01   0.017 0.986378    
## HOMEKIDS4                     -2.590e-01  2.307e-01  -1.123 0.261453    
## HOMEKIDS5                     -1.711e-01  7.107e-01  -0.241 0.809709    
## YOJ                            1.154e+00  1.007e+00   1.146 0.251735    
## INCOME                        -3.919e-06  4.591e-06  -0.854 0.393345    
## PARENT1Yes                     2.048e-01  1.248e-01   1.642 0.100602    
## HOME_VAL                       1.900e-05  1.223e-05   1.554 0.120168    
## MSTATUSYes                    -5.403e-01  9.374e-02  -5.764 8.20e-09 ***
## SEXM                           5.289e-02  1.154e-01   0.458 0.646660    
## EDUCATIONBachelors            -3.945e-01  1.279e-01  -3.084 0.002044 ** 
## EDUCATIONHigh School           8.298e-03  1.013e-01   0.082 0.934687    
## EDUCATIONMasters              -2.597e-01  1.937e-01  -1.341 0.179911    
## EDUCATIONPhD                  -2.523e-01  2.278e-01  -1.107 0.268153    
## JOBBlue Collar                 3.797e-01  1.893e-01   2.006 0.044842 *  
## JOBClerical                    4.460e-01  2.011e-01   2.217 0.026612 *  
## JOBDoctor                     -4.007e-01  2.715e-01  -1.476 0.139994    
## JOBHome Maker                  1.144e-01  2.338e-01   0.489 0.624562    
## JOBLawyer                      1.250e-01  1.725e-01   0.725 0.468677    
## JOBManager                    -5.487e-01  1.735e-01  -3.162 0.001565 ** 
## JOBProfessional                1.953e-01  1.815e-01   1.076 0.281725    
## JOBStudent                    -8.114e-03  2.429e-01  -0.033 0.973357    
## TRAVTIME                      -1.433e+00  1.940e+00  -0.739 0.460070    
## CAR_USEPrivate                -7.496e-01  9.385e-02  -7.988 1.38e-15 ***
## BLUEBOOK                      -3.345e-05  3.844e-04  -0.087 0.930651    
## TIF                           -1.378e-01  1.331e+00  -0.104 0.917545    
## CAR_TYPEPanel Truck            5.300e-01  1.721e-01   3.080 0.002068 ** 
## CAR_TYPEPickup                 5.651e-01  1.053e-01   5.365 8.10e-08 ***
## CAR_TYPESports Car             8.811e-01  1.356e-01   6.498 8.15e-11 ***
## CAR_TYPESUV                    7.117e-01  1.153e-01   6.174 6.65e-10 ***
## CAR_TYPEVan                    7.202e-01  1.299e-01   5.544 2.95e-08 ***
## RED_CARyes                    -4.457e-02  8.857e-02  -0.503 0.614837    
## OLDCLAIM                       1.268e-04  2.986e-04   0.425 0.670998    
## CLM_FREQ.L                     2.013e+00  7.812e+00   0.258 0.796678    
## CLM_FREQ.Q                    -1.377e+00  7.129e+00  -0.193 0.846812    
## CLM_FREQ.C                     1.141e+00  4.875e+00   0.234 0.814877    
## CLM_FREQ^4                    -5.284e-01  2.466e+00  -0.214 0.830376    
## CLM_FREQ^5                     1.345e-01  8.298e-01   0.162 0.871225    
## REVOKEDYes                     9.734e-01  9.681e-02  10.055  < 2e-16 ***
## MVR_PTS                        4.336e+00  3.281e+00   1.321 0.186409    
## CAR_AGE                       -6.396e-01  6.930e-01  -0.923 0.356053    
## URBANICITYHighly Urban/ Urban  2.381e+00  1.154e-01  20.623  < 2e-16 ***
## AGE_log                        1.438e+03  9.255e+02   1.554 0.120232    
## YOJ_log                        1.343e+01  9.797e+00   1.371 0.170480    
## INCOME_log                    -1.503e-01  8.356e-02  -1.799 0.072026 .  
## HOME_VAL_log                  -1.422e-01  5.284e-01  -0.269 0.787768    
## TRAVTIME_log                   1.359e+02  1.974e+02   0.689 0.491027    
## BLUEBOOK_log                   2.529e+00  8.014e+00   0.316 0.752282    
## TIF_log                       -2.048e+01  7.802e+01  -0.262 0.792939    
## OLDCLAIM_log                  -1.535e+00  4.854e+00  -0.316 0.751885    
## MVR_PTS_log                    2.054e+01  1.921e+01   1.069 0.284921    
## CAR_AGE_log                   -2.100e+01  2.213e+01  -0.949 0.342598    
## AGE_sqrt                       1.421e+03  9.606e+02   1.479 0.139108    
## YOJ_sqrt                      -1.754e+01  1.509e+01  -1.162 0.245089    
## INCOME_sqrt                   -3.440e-03  1.956e-03  -1.758 0.078730 .  
## HOME_VAL_sqrt                 -4.139e-02  4.380e-02  -0.945 0.344630    
## TRAVTIME_sqrt                  1.501e+02  2.127e+02   0.706 0.480497    
## BLUEBOOK_sqrt                  1.487e-01  6.558e-01   0.227 0.820649    
## TIF_sqrt                      -1.173e+01  4.618e+01  -0.254 0.799425    
## OLDCLAIM_sqrt                 -2.079e-01  4.642e-01  -0.448 0.654196    
## MVR_PTS_sqrt                  -5.085e+01  4.230e+01  -1.202 0.229380    
## CAR_AGE_sqrt                  -1.744e-01  4.768e+00  -0.037 0.970817    
## AGE_cbrt                      -4.771e+03  3.160e+03  -1.510 0.131055    
## YOJ_cbrt                       5.497e+00  6.919e+00   0.794 0.426921    
## INCOME_cbrt                    5.352e-02  2.965e-02   1.805 0.071055 .  
## HOME_VAL_cbrt                  2.710e-01  4.025e-01   0.673 0.500699    
## TRAVTIME_cbrt                 -4.819e+02  6.908e+02  -0.698 0.485390    
## BLUEBOOK_cbrt                 -1.398e+00  5.152e+00  -0.271 0.786099    
## TIF_cbrt                       5.336e+01  2.051e+02   0.260 0.794764    
## OLDCLAIM_cbrt                  1.458e+00  3.494e+00   0.417 0.676425    
## MVR_PTS_cbrt                   3.238e+01  2.575e+01   1.257 0.208662    
## CAR_AGE_cbrt                   3.638e+01  3.933e+01   0.925 0.354938    
## HV_INC_RATIO                  -1.182e-04  5.476e-04  -0.216 0.829104    
## TRT_MVR_PRODUCT                8.584e-04  8.556e-04   1.003 0.315729    
## CAR_AGE_factModerate          -7.225e-02  3.757e-01  -0.192 0.847514    
## CAR_AGE_factOld               -2.669e-01  3.712e-01  -0.719 0.472177    
## HOME_VAL_factModerate          2.404e-01  1.419e-01   1.694 0.090201 .  
## HOME_VAL_factHigh             -2.757e-02  2.661e-01  -0.104 0.917490    
## INCOME_factModerate           -2.045e-02  1.419e-01  -0.144 0.885450    
## INCOME_factHigh               -2.882e-01  2.151e-01  -1.340 0.180276    
## MVR_PTS_factModerate           1.127e-01  3.666e-01   0.307 0.758483    
## MVR_PTS_factHigh              -1.842e-02  4.533e-01  -0.041 0.967586    
## OLDCLAIM_factHigh             -1.771e-01  1.808e-01  -0.980 0.327165    
## TIF_factModerate              -6.625e-01  3.786e+00  -0.175 0.861082    
## TIF_factHigh                  -6.525e-01  3.807e+00  -0.171 0.863910    
## TRAVTIME_factModerate          1.851e-02  1.624e-01   0.114 0.909253    
## TRAVTIME_factLong              1.175e-01  2.181e-01   0.539 0.589975    
## YOJ_factModerate              -3.923e-02  1.515e-01  -0.259 0.795659    
## YOJ_factHigh                  -1.672e-02  2.262e-01  -0.074 0.941091    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 9417.3  on 8159  degrees of freedom
## Residual deviance: 7093.4  on 8064  degrees of freedom
##   (1 observation deleted due to missingness)
## AIC: 7285.4
## 
## Number of Fisher Scoring iterations: 9
plot(model2)

we notice how some variables are not statistically significant; for study purposes, I will assume that this is a valid model.

# Generate confidence intervals for regression slope
confint.default(model2)
##                                       2.5 %        97.5 %
## (Intercept)                   -6.362180e+02  5.923872e+03
## KIDSDRIV1                      4.732045e-01  9.487048e-01
## KIDSDRIV2                      6.451129e-01  1.309201e+00
## KIDSDRIV3                      5.768796e-01  1.826525e+00
## KIDSDRIV4                     -6.114412e+02  5.839504e+02
## AGE                           -2.805961e+01  5.135498e+00
## HOMEKIDS1                     -1.240693e-01  3.695306e-01
## HOMEKIDS2                     -1.835730e-01  3.069332e-01
## HOMEKIDS3                     -2.847629e-01  2.897677e-01
## HOMEKIDS4                     -7.111246e-01  1.930671e-01
## HOMEKIDS5                     -1.564175e+00  1.221881e+00
## YOJ                           -8.191140e-01  3.126319e+00
## INCOME                        -1.291731e-05  5.079623e-06
## PARENT1Yes                    -3.967547e-02  4.493644e-01
## HOME_VAL                      -4.962223e-06  4.296018e-05
## MSTATUSYes                    -7.240485e-01 -3.566076e-01
## SEXM                          -1.732516e-01  2.790348e-01
## EDUCATIONBachelors            -6.452560e-01 -1.437693e-01
## EDUCATIONHigh School          -1.901580e-01  2.067536e-01
## EDUCATIONMasters              -6.393571e-01  1.198794e-01
## EDUCATIONPhD                  -6.988592e-01  1.942674e-01
## JOBBlue Collar                 8.738496e-03  7.506131e-01
## JOBClerical                    5.173186e-02  8.401688e-01
## JOBDoctor                     -9.328680e-01  1.314544e-01
## JOBHome Maker                 -3.437751e-01  5.725869e-01
## JOBLawyer                     -2.130477e-01  4.629916e-01
## JOBManager                    -8.887088e-01 -2.086209e-01
## JOBProfessional               -1.603255e-01  5.509992e-01
## JOBStudent                    -4.842851e-01  4.680568e-01
## TRAVTIME                      -5.236229e+00  2.369533e+00
## CAR_USEPrivate                -9.335890e-01 -5.656976e-01
## BLUEBOOK                      -7.867878e-04  7.198887e-04
## TIF                           -2.746682e+00  2.471076e+00
## CAR_TYPEPanel Truck            1.927642e-01  8.671991e-01
## CAR_TYPEPickup                 3.586461e-01  7.715495e-01
## CAR_TYPESports Car             6.153094e-01  1.146841e+00
## CAR_TYPESUV                    4.857723e-01  9.376192e-01
## CAR_TYPEVan                    4.656173e-01  9.748397e-01
## RED_CARyes                    -2.181687e-01  1.290319e-01
## OLDCLAIM                      -4.583689e-04  7.120259e-04
## CLM_FREQ.L                    -1.329815e+01  1.732355e+01
## CLM_FREQ.Q                    -1.534988e+01  1.259540e+01
## CLM_FREQ.C                    -8.413659e+00  1.069658e+01
## CLM_FREQ^4                    -5.362575e+00  4.305839e+00
## CLM_FREQ^5                    -1.491954e+00  1.760994e+00
## REVOKEDYes                     7.836792e-01  1.163151e+00
## MVR_PTS                       -2.095844e+00  1.076738e+01
## CAR_AGE                       -1.997767e+00  7.186538e-01
## URBANICITYHighly Urban/ Urban  2.154584e+00  2.607123e+00
## AGE_log                       -3.759087e+02  3.252025e+03
## YOJ_log                       -5.773360e+00  3.263004e+01
## INCOME_log                    -3.140847e-01  1.345350e-02
## HOME_VAL_log                  -1.177879e+00  8.933819e-01
## TRAVTIME_log                  -2.509451e+02  5.228265e+02
## BLUEBOOK_log                  -1.317696e+01  1.823564e+01
## TIF_log                       -1.733964e+02  1.324364e+02
## OLDCLAIM_log                  -1.104889e+01  7.979491e+00
## MVR_PTS_log                   -1.710714e+01  5.818621e+01
## CAR_AGE_log                   -6.436881e+01  2.236902e+01
## AGE_sqrt                      -4.618867e+02  3.303507e+03
## YOJ_sqrt                      -4.710987e+01  1.203428e+01
## INCOME_sqrt                   -7.274239e-03  3.949321e-04
## HOME_VAL_sqrt                 -1.272442e-01  4.445449e-02
## TRAVTIME_sqrt                 -2.668314e+02  5.669610e+02
## BLUEBOOK_sqrt                 -1.136608e+00  1.433944e+00
## TIF_sqrt                      -1.022459e+02  7.877784e+01
## OLDCLAIM_sqrt                 -1.117782e+00  7.019014e-01
## MVR_PTS_sqrt                  -1.337636e+02  3.206705e+01
## CAR_AGE_sqrt                  -9.519973e+00  9.171098e+00
## AGE_cbrt                      -1.096350e+04  1.421850e+03
## YOJ_cbrt                      -8.064150e+00  1.905819e+01
## INCOME_cbrt                   -4.590843e-03  1.116302e-01
## HOME_VAL_cbrt                 -5.178084e-01  1.059848e+00
## TRAVTIME_cbrt                 -1.835801e+03  8.719587e+02
## BLUEBOOK_cbrt                 -1.149641e+01  8.699986e+00
## TIF_cbrt                      -3.487044e+02  4.554305e+02
## OLDCLAIM_cbrt                 -5.390425e+00  8.307127e+00
## MVR_PTS_cbrt                  -1.809788e+01  8.285727e+01
## CAR_AGE_cbrt                  -4.070100e+01  1.134621e+02
## HV_INC_RATIO                  -1.191545e-03  9.551320e-04
## TRT_MVR_PRODUCT               -8.185625e-04  2.535400e-03
## CAR_AGE_factModerate          -8.086301e-01  6.641377e-01
## CAR_AGE_factOld               -9.944909e-01  4.607052e-01
## HOME_VAL_factModerate         -3.769382e-02  5.185653e-01
## HOME_VAL_factHigh             -5.491165e-01  4.939827e-01
## INCOME_factModerate           -2.986477e-01  2.577507e-01
## INCOME_factHigh               -7.098153e-01  1.333773e-01
## MVR_PTS_factModerate          -6.057647e-01  8.311897e-01
## MVR_PTS_factHigh              -9.068555e-01  8.700156e-01
## OLDCLAIM_factHigh             -5.313949e-01  1.771656e-01
## TIF_factModerate              -8.082005e+00  6.757083e+00
## TIF_factHigh                  -8.114162e+00  6.809118e+00
## TRAVTIME_factModerate         -2.998275e-01  3.368535e-01
## TRAVTIME_factLong             -3.099097e-01  5.449424e-01
## YOJ_factModerate              -3.360938e-01  2.576436e-01
## YOJ_factHigh                  -4.600438e-01  4.266133e-01
# Generate the odds ratios
exp(coef(model2))
##                   (Intercept)                     KIDSDRIV1 
##                           Inf                  2.035934e+00 
##                     KIDSDRIV2                     KIDSDRIV3 
##                  2.656892e+00                  3.325774e+00 
##                     KIDSDRIV4                           AGE 
##                  1.072606e-06                  1.052188e-05 
##                     HOMEKIDS1                     HOMEKIDS2 
##                  1.130580e+00                  1.063622e+00 
##                     HOMEKIDS3                     HOMEKIDS4 
##                  1.002506e+00                  7.718008e-01 
##                     HOMEKIDS5                           YOJ 
##                  8.426974e-01                  3.169591e+00 
##                        INCOME                    PARENT1Yes 
##                  9.999961e-01                  1.227334e+00 
##                      HOME_VAL                    MSTATUSYes 
##                  1.000019e+00                  5.825571e-01 
##                          SEXM            EDUCATIONBachelors 
##                  1.054315e+00                  6.740085e-01 
##          EDUCATIONHigh School              EDUCATIONMasters 
##                  1.008332e+00                  7.712530e-01 
##                  EDUCATIONPhD                JOBBlue Collar 
##                  7.770148e-01                  1.461811e+00 
##                   JOBClerical                     JOBDoctor 
##                  1.561974e+00                  6.698464e-01 
##                 JOBHome Maker                     JOBLawyer 
##                  1.121207e+00                  1.133117e+00 
##                    JOBManager               JOBProfessional 
##                  5.777206e-01                  1.215720e+00 
##                    JOBStudent                      TRAVTIME 
##                  9.919187e-01                  2.385091e-01 
##                CAR_USEPrivate                      BLUEBOOK 
##                  4.725351e-01                  9.999666e-01 
##                           TIF           CAR_TYPEPanel Truck 
##                  8.712705e-01                  1.698901e+00 
##                CAR_TYPEPickup            CAR_TYPESports Car 
##                  1.759620e+00                  2.413493e+00 
##                   CAR_TYPESUV                   CAR_TYPEVan 
##                  2.037443e+00                  2.054903e+00 
##                    RED_CARyes                      OLDCLAIM 
##                  9.564102e-01                  1.000127e+00 
##                    CLM_FREQ.L                    CLM_FREQ.Q 
##                  7.483492e+00                  2.522747e-01 
##                    CLM_FREQ.C                    CLM_FREQ^4 
##                  3.131345e+00                  5.895664e-01 
##                    CLM_FREQ^5                    REVOKEDYes 
##                  1.143988e+00                  2.646968e+00 
##                       MVR_PTS                       CAR_AGE 
##                  7.638360e+01                  5.275264e-01 
## URBANICITYHighly Urban/ Urban                       AGE_log 
##                  1.081413e+01                           Inf 
##                       YOJ_log                    INCOME_log 
##                  6.789740e+05                  8.604364e-01 
##                  HOME_VAL_log                  TRAVTIME_log 
##                  8.674056e-01                  1.092162e+59 
##                  BLUEBOOK_log                       TIF_log 
##                  1.254522e+01                  1.275433e-09 
##                  OLDCLAIM_log                   MVR_PTS_log 
##                  2.155206e-01                  8.321629e+08 
##                   CAR_AGE_log                      AGE_sqrt 
##                  7.583379e-10                           Inf 
##                      YOJ_sqrt                   INCOME_sqrt 
##                  2.417865e-08                  9.965663e-01 
##                 HOME_VAL_sqrt                 TRAVTIME_sqrt 
##                  9.594502e-01                  1.487050e+65 
##                 BLUEBOOK_sqrt                      TIF_sqrt 
##                  1.160288e+00                  8.016185e-06 
##                 OLDCLAIM_sqrt                  MVR_PTS_sqrt 
##                  8.122554e-01                  8.258207e-23 
##                  CAR_AGE_sqrt                      AGE_cbrt 
##                  8.399296e-01                  0.000000e+00 
##                      YOJ_cbrt                   INCOME_cbrt 
##                  2.439635e+02                  1.054978e+00 
##                 HOME_VAL_cbrt                 TRAVTIME_cbrt 
##                  1.311301e+00                 5.060917e-210 
##                 BLUEBOOK_cbrt                      TIF_cbrt 
##                  2.470385e-01                  1.497195e+23 
##                 OLDCLAIM_cbrt                  MVR_PTS_cbrt 
##                  4.298866e+00                  1.154309e+14 
##                  CAR_AGE_cbrt                  HV_INC_RATIO 
##                  6.307658e+15                  9.998818e-01 
##               TRT_MVR_PRODUCT          CAR_AGE_factModerate 
##                  1.000859e+00                  9.303018e-01 
##               CAR_AGE_factOld         HOME_VAL_factModerate 
##                  7.657551e-01                  1.271803e+00 
##             HOME_VAL_factHigh           INCOME_factModerate 
##                  9.728096e-01                  9.797592e-01 
##               INCOME_factHigh          MVR_PTS_factModerate 
##                  7.495974e-01                  1.119310e+00 
##              MVR_PTS_factHigh             OLDCLAIM_factHigh 
##                  9.817487e-01                  8.376837e-01 
##              TIF_factModerate                  TIF_factHigh 
##                  5.155811e-01                  5.207307e-01 
##         TRAVTIME_factModerate             TRAVTIME_factLong 
##                  1.018685e+00                  1.124700e+00 
##              YOJ_factModerate                  YOJ_factHigh 
##                  9.615343e-01                  9.834237e-01
# Generate confidence intervals for regression slope
confint.default(model2)
##                                       2.5 %        97.5 %
## (Intercept)                   -6.362180e+02  5.923872e+03
## KIDSDRIV1                      4.732045e-01  9.487048e-01
## KIDSDRIV2                      6.451129e-01  1.309201e+00
## KIDSDRIV3                      5.768796e-01  1.826525e+00
## KIDSDRIV4                     -6.114412e+02  5.839504e+02
## AGE                           -2.805961e+01  5.135498e+00
## HOMEKIDS1                     -1.240693e-01  3.695306e-01
## HOMEKIDS2                     -1.835730e-01  3.069332e-01
## HOMEKIDS3                     -2.847629e-01  2.897677e-01
## HOMEKIDS4                     -7.111246e-01  1.930671e-01
## HOMEKIDS5                     -1.564175e+00  1.221881e+00
## YOJ                           -8.191140e-01  3.126319e+00
## INCOME                        -1.291731e-05  5.079623e-06
## PARENT1Yes                    -3.967547e-02  4.493644e-01
## HOME_VAL                      -4.962223e-06  4.296018e-05
## MSTATUSYes                    -7.240485e-01 -3.566076e-01
## SEXM                          -1.732516e-01  2.790348e-01
## EDUCATIONBachelors            -6.452560e-01 -1.437693e-01
## EDUCATIONHigh School          -1.901580e-01  2.067536e-01
## EDUCATIONMasters              -6.393571e-01  1.198794e-01
## EDUCATIONPhD                  -6.988592e-01  1.942674e-01
## JOBBlue Collar                 8.738496e-03  7.506131e-01
## JOBClerical                    5.173186e-02  8.401688e-01
## JOBDoctor                     -9.328680e-01  1.314544e-01
## JOBHome Maker                 -3.437751e-01  5.725869e-01
## JOBLawyer                     -2.130477e-01  4.629916e-01
## JOBManager                    -8.887088e-01 -2.086209e-01
## JOBProfessional               -1.603255e-01  5.509992e-01
## JOBStudent                    -4.842851e-01  4.680568e-01
## TRAVTIME                      -5.236229e+00  2.369533e+00
## CAR_USEPrivate                -9.335890e-01 -5.656976e-01
## BLUEBOOK                      -7.867878e-04  7.198887e-04
## TIF                           -2.746682e+00  2.471076e+00
## CAR_TYPEPanel Truck            1.927642e-01  8.671991e-01
## CAR_TYPEPickup                 3.586461e-01  7.715495e-01
## CAR_TYPESports Car             6.153094e-01  1.146841e+00
## CAR_TYPESUV                    4.857723e-01  9.376192e-01
## CAR_TYPEVan                    4.656173e-01  9.748397e-01
## RED_CARyes                    -2.181687e-01  1.290319e-01
## OLDCLAIM                      -4.583689e-04  7.120259e-04
## CLM_FREQ.L                    -1.329815e+01  1.732355e+01
## CLM_FREQ.Q                    -1.534988e+01  1.259540e+01
## CLM_FREQ.C                    -8.413659e+00  1.069658e+01
## CLM_FREQ^4                    -5.362575e+00  4.305839e+00
## CLM_FREQ^5                    -1.491954e+00  1.760994e+00
## REVOKEDYes                     7.836792e-01  1.163151e+00
## MVR_PTS                       -2.095844e+00  1.076738e+01
## CAR_AGE                       -1.997767e+00  7.186538e-01
## URBANICITYHighly Urban/ Urban  2.154584e+00  2.607123e+00
## AGE_log                       -3.759087e+02  3.252025e+03
## YOJ_log                       -5.773360e+00  3.263004e+01
## INCOME_log                    -3.140847e-01  1.345350e-02
## HOME_VAL_log                  -1.177879e+00  8.933819e-01
## TRAVTIME_log                  -2.509451e+02  5.228265e+02
## BLUEBOOK_log                  -1.317696e+01  1.823564e+01
## TIF_log                       -1.733964e+02  1.324364e+02
## OLDCLAIM_log                  -1.104889e+01  7.979491e+00
## MVR_PTS_log                   -1.710714e+01  5.818621e+01
## CAR_AGE_log                   -6.436881e+01  2.236902e+01
## AGE_sqrt                      -4.618867e+02  3.303507e+03
## YOJ_sqrt                      -4.710987e+01  1.203428e+01
## INCOME_sqrt                   -7.274239e-03  3.949321e-04
## HOME_VAL_sqrt                 -1.272442e-01  4.445449e-02
## TRAVTIME_sqrt                 -2.668314e+02  5.669610e+02
## BLUEBOOK_sqrt                 -1.136608e+00  1.433944e+00
## TIF_sqrt                      -1.022459e+02  7.877784e+01
## OLDCLAIM_sqrt                 -1.117782e+00  7.019014e-01
## MVR_PTS_sqrt                  -1.337636e+02  3.206705e+01
## CAR_AGE_sqrt                  -9.519973e+00  9.171098e+00
## AGE_cbrt                      -1.096350e+04  1.421850e+03
## YOJ_cbrt                      -8.064150e+00  1.905819e+01
## INCOME_cbrt                   -4.590843e-03  1.116302e-01
## HOME_VAL_cbrt                 -5.178084e-01  1.059848e+00
## TRAVTIME_cbrt                 -1.835801e+03  8.719587e+02
## BLUEBOOK_cbrt                 -1.149641e+01  8.699986e+00
## TIF_cbrt                      -3.487044e+02  4.554305e+02
## OLDCLAIM_cbrt                 -5.390425e+00  8.307127e+00
## MVR_PTS_cbrt                  -1.809788e+01  8.285727e+01
## CAR_AGE_cbrt                  -4.070100e+01  1.134621e+02
## HV_INC_RATIO                  -1.191545e-03  9.551320e-04
## TRT_MVR_PRODUCT               -8.185625e-04  2.535400e-03
## CAR_AGE_factModerate          -8.086301e-01  6.641377e-01
## CAR_AGE_factOld               -9.944909e-01  4.607052e-01
## HOME_VAL_factModerate         -3.769382e-02  5.185653e-01
## HOME_VAL_factHigh             -5.491165e-01  4.939827e-01
## INCOME_factModerate           -2.986477e-01  2.577507e-01
## INCOME_factHigh               -7.098153e-01  1.333773e-01
## MVR_PTS_factModerate          -6.057647e-01  8.311897e-01
## MVR_PTS_factHigh              -9.068555e-01  8.700156e-01
## OLDCLAIM_factHigh             -5.313949e-01  1.771656e-01
## TIF_factModerate              -8.082005e+00  6.757083e+00
## TIF_factHigh                  -8.114162e+00  6.809118e+00
## TRAVTIME_factModerate         -2.998275e-01  3.368535e-01
## TRAVTIME_factLong             -3.099097e-01  5.449424e-01
## YOJ_factModerate              -3.360938e-01  2.576436e-01
## YOJ_factHigh                  -4.600438e-01  4.266133e-01
# Generate the odds ratios
exp(coef(model2))
##                   (Intercept)                     KIDSDRIV1 
##                           Inf                  2.035934e+00 
##                     KIDSDRIV2                     KIDSDRIV3 
##                  2.656892e+00                  3.325774e+00 
##                     KIDSDRIV4                           AGE 
##                  1.072606e-06                  1.052188e-05 
##                     HOMEKIDS1                     HOMEKIDS2 
##                  1.130580e+00                  1.063622e+00 
##                     HOMEKIDS3                     HOMEKIDS4 
##                  1.002506e+00                  7.718008e-01 
##                     HOMEKIDS5                           YOJ 
##                  8.426974e-01                  3.169591e+00 
##                        INCOME                    PARENT1Yes 
##                  9.999961e-01                  1.227334e+00 
##                      HOME_VAL                    MSTATUSYes 
##                  1.000019e+00                  5.825571e-01 
##                          SEXM            EDUCATIONBachelors 
##                  1.054315e+00                  6.740085e-01 
##          EDUCATIONHigh School              EDUCATIONMasters 
##                  1.008332e+00                  7.712530e-01 
##                  EDUCATIONPhD                JOBBlue Collar 
##                  7.770148e-01                  1.461811e+00 
##                   JOBClerical                     JOBDoctor 
##                  1.561974e+00                  6.698464e-01 
##                 JOBHome Maker                     JOBLawyer 
##                  1.121207e+00                  1.133117e+00 
##                    JOBManager               JOBProfessional 
##                  5.777206e-01                  1.215720e+00 
##                    JOBStudent                      TRAVTIME 
##                  9.919187e-01                  2.385091e-01 
##                CAR_USEPrivate                      BLUEBOOK 
##                  4.725351e-01                  9.999666e-01 
##                           TIF           CAR_TYPEPanel Truck 
##                  8.712705e-01                  1.698901e+00 
##                CAR_TYPEPickup            CAR_TYPESports Car 
##                  1.759620e+00                  2.413493e+00 
##                   CAR_TYPESUV                   CAR_TYPEVan 
##                  2.037443e+00                  2.054903e+00 
##                    RED_CARyes                      OLDCLAIM 
##                  9.564102e-01                  1.000127e+00 
##                    CLM_FREQ.L                    CLM_FREQ.Q 
##                  7.483492e+00                  2.522747e-01 
##                    CLM_FREQ.C                    CLM_FREQ^4 
##                  3.131345e+00                  5.895664e-01 
##                    CLM_FREQ^5                    REVOKEDYes 
##                  1.143988e+00                  2.646968e+00 
##                       MVR_PTS                       CAR_AGE 
##                  7.638360e+01                  5.275264e-01 
## URBANICITYHighly Urban/ Urban                       AGE_log 
##                  1.081413e+01                           Inf 
##                       YOJ_log                    INCOME_log 
##                  6.789740e+05                  8.604364e-01 
##                  HOME_VAL_log                  TRAVTIME_log 
##                  8.674056e-01                  1.092162e+59 
##                  BLUEBOOK_log                       TIF_log 
##                  1.254522e+01                  1.275433e-09 
##                  OLDCLAIM_log                   MVR_PTS_log 
##                  2.155206e-01                  8.321629e+08 
##                   CAR_AGE_log                      AGE_sqrt 
##                  7.583379e-10                           Inf 
##                      YOJ_sqrt                   INCOME_sqrt 
##                  2.417865e-08                  9.965663e-01 
##                 HOME_VAL_sqrt                 TRAVTIME_sqrt 
##                  9.594502e-01                  1.487050e+65 
##                 BLUEBOOK_sqrt                      TIF_sqrt 
##                  1.160288e+00                  8.016185e-06 
##                 OLDCLAIM_sqrt                  MVR_PTS_sqrt 
##                  8.122554e-01                  8.258207e-23 
##                  CAR_AGE_sqrt                      AGE_cbrt 
##                  8.399296e-01                  0.000000e+00 
##                      YOJ_cbrt                   INCOME_cbrt 
##                  2.439635e+02                  1.054978e+00 
##                 HOME_VAL_cbrt                 TRAVTIME_cbrt 
##                  1.311301e+00                 5.060917e-210 
##                 BLUEBOOK_cbrt                      TIF_cbrt 
##                  2.470385e-01                  1.497195e+23 
##                 OLDCLAIM_cbrt                  MVR_PTS_cbrt 
##                  4.298866e+00                  1.154309e+14 
##                  CAR_AGE_cbrt                  HV_INC_RATIO 
##                  6.307658e+15                  9.998818e-01 
##               TRT_MVR_PRODUCT          CAR_AGE_factModerate 
##                  1.000859e+00                  9.303018e-01 
##               CAR_AGE_factOld         HOME_VAL_factModerate 
##                  7.657551e-01                  1.271803e+00 
##             HOME_VAL_factHigh           INCOME_factModerate 
##                  9.728096e-01                  9.797592e-01 
##               INCOME_factHigh          MVR_PTS_factModerate 
##                  7.495974e-01                  1.119310e+00 
##              MVR_PTS_factHigh             OLDCLAIM_factHigh 
##                  9.817487e-01                  8.376837e-01 
##              TIF_factModerate                  TIF_factHigh 
##                  5.155811e-01                  5.207307e-01 
##         TRAVTIME_factModerate             TRAVTIME_factLong 
##                  1.018685e+00                  1.124700e+00 
##              YOJ_factModerate                  YOJ_factHigh 
##                  9.615343e-01                  9.834237e-01

I will select the variables using the stepwise method

The ‘stepAIC’ function in R performs a stepwise model selection with an objective to minimize the AIC value.

Using Step in both direction

create multiple models using the STEP function from R.

Let’s check an ANOVA table based on the above testing results.

step_b$anova
##                 Step Df   Deviance Resid. Df Resid. Dev      AIC
## 1                    NA         NA      8160   9417.962 9419.962
## 2       + URBANICITY -1 502.103089      8159   8915.859 8919.859
## 3              + JOB -8 523.306410      8151   8392.553 8412.553
## 4  + TRT_MVR_PRODUCT -1 225.402921      8150   8167.150 8189.150
## 5    + HOME_VAL_sqrt -1 183.181057      8149   7983.969 8007.969
## 6         + CAR_TYPE -5 164.006081      8144   7819.963 7853.963
## 7          + REVOKED -1 103.395598      8143   7716.567 7752.567
## 8          + PARENT1 -1  75.136156      8142   7641.431 7679.431
## 9          + CAR_USE -1  65.145841      8141   7576.285 7616.285
## 10        + TIF_cbrt -1  62.057877      8140   7514.227 7556.227
## 11        + KIDSDRIV -4  52.579316      8136   7461.648 7511.648
## 12   + BLUEBOOK_cbrt -1  47.416795      8135   7414.231 7466.231
## 13        + CLM_FREQ -5  44.954091      8130   7369.277 7431.277
## 14     + INCOME_sqrt -1  28.595953      8129   7340.681 7404.681
## 15         + MSTATUS -1  25.603899      8128   7315.077 7381.077
## 16        + OLDCLAIM -1  24.623660      8127   7290.454 7358.454
## 17   + TRAVTIME_cbrt -1  23.079405      8126   7267.374 7337.374
## 18       + EDUCATION -4  20.765533      8122   7246.609 7324.609
## 19         + MVR_PTS -1   5.403444      8121   7241.205 7321.205
## 20    + MVR_PTS_fact -2  12.618083      8119   7228.587 7312.587
## 21         + AGE_log -1   3.752757      8118   7224.834 7310.834
## 22             + AGE -1  53.395704      8117   7171.439 7259.439
## 23        + AGE_cbrt -1  10.081550      8116   7161.357 7251.357
## 24 - TRT_MVR_PRODUCT  1   1.098658      8117   7162.456 7250.456
## 25     + INCOME_fact -2   4.657489      8115   7157.798 7249.798
## 26          + INCOME -1   4.462656      8114   7153.336 7247.336
## 27        + AGE_sqrt -1   2.892799      8113   7150.443 7246.443

From the above results, it shows that the best model is as follows:

summary(step_b)
## 
## Call:
## glm(formula = TARGET_FLAG ~ URBANICITY + JOB + HOME_VAL_sqrt + 
##     CAR_TYPE + REVOKED + PARENT1 + CAR_USE + TIF_cbrt + KIDSDRIV + 
##     BLUEBOOK_cbrt + CLM_FREQ + INCOME_sqrt + MSTATUS + OLDCLAIM + 
##     TRAVTIME_cbrt + EDUCATION + MVR_PTS + MVR_PTS_fact + AGE_log + 
##     AGE + AGE_cbrt + INCOME_fact + INCOME + AGE_sqrt, family = binomial(link = "logit"), 
##     data = data_train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.5191  -0.6983  -0.3852   0.5951   3.0129  
## 
## Coefficients:
##                                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                    2.708e+03  1.588e+03   1.705 0.088128 .  
## URBANICITYHighly Urban/ Urban  2.372e+00  1.137e-01  20.852  < 2e-16 ***
## JOBBlue Collar                 3.654e-01  1.880e-01   1.943 0.051995 .  
## JOBClerical                    4.164e-01  1.996e-01   2.087 0.036899 *  
## JOBDoctor                     -4.017e-01  2.690e-01  -1.493 0.135354    
## JOBHome Maker                  1.373e-01  2.248e-01   0.611 0.541503    
## JOBLawyer                      1.282e-01  1.714e-01   0.748 0.454564    
## JOBManager                    -5.466e-01  1.725e-01  -3.169 0.001530 ** 
## JOBProfessional                2.012e-01  1.803e-01   1.116 0.264294    
## JOBStudent                    -1.414e-02  2.325e-01  -0.061 0.951493    
## HOME_VAL_sqrt                 -7.470e-04  1.880e-04  -3.975 7.05e-05 ***
## CAR_TYPEPanel Truck            5.933e-01  1.487e-01   3.989 6.62e-05 ***
## CAR_TYPEPickup                 5.563e-01  1.019e-01   5.458 4.83e-08 ***
## CAR_TYPESports Car             8.839e-01  1.102e-01   8.023 1.03e-15 ***
## CAR_TYPESUV                    6.899e-01  8.727e-02   7.906 2.66e-15 ***
## CAR_TYPEVan                    7.075e-01  1.241e-01   5.702 1.18e-08 ***
## REVOKEDYes                     9.525e-01  9.389e-02  10.146  < 2e-16 ***
## PARENT1Yes                     2.599e-01  1.030e-01   2.523 0.011650 *  
## CAR_USEPrivate                -7.653e-01  9.293e-02  -8.235  < 2e-16 ***
## TIF_cbrt                      -4.841e-01  6.173e-02  -7.842 4.42e-15 ***
## KIDSDRIV1                      7.536e-01  1.087e-01   6.930 4.20e-12 ***
## KIDSDRIV2                      9.728e-01  1.549e-01   6.281 3.37e-10 ***
## KIDSDRIV3                      1.123e+00  3.060e-01   3.669 0.000244 ***
## KIDSDRIV4                      1.294e+00  1.180e+00   1.097 0.272683    
## BLUEBOOK_cbrt                 -4.484e-02  8.006e-03  -5.600 2.14e-08 ***
## CLM_FREQ.L                     6.637e-01  3.319e-01   1.999 0.045561 *  
## CLM_FREQ.Q                    -1.503e-01  3.024e-01  -0.497 0.619125    
## CLM_FREQ.C                     2.802e-01  2.265e-01   1.237 0.216040    
## CLM_FREQ^4                    -1.117e-01  1.516e-01  -0.737 0.460942    
## CLM_FREQ^5                    -5.681e-03  9.494e-02  -0.060 0.952291    
## INCOME_sqrt                   -3.601e-03  1.050e-03  -3.428 0.000609 ***
## MSTATUSYes                    -5.321e-01  8.646e-02  -6.154 7.55e-10 ***
## OLDCLAIM                      -1.982e-05  4.251e-06  -4.662 3.13e-06 ***
## TRAVTIME_cbrt                  4.332e-01  5.460e-02   7.933 2.13e-15 ***
## EDUCATIONBachelors            -3.775e-01  1.131e-01  -3.338 0.000845 ***
## EDUCATIONHigh School           1.308e-02  9.831e-02   0.133 0.894120    
## EDUCATIONMasters              -2.531e-01  1.648e-01  -1.536 0.124610    
## EDUCATIONPhD                  -2.449e-01  2.057e-01  -1.190 0.233993    
## MVR_PTS                        1.856e-01  3.556e-02   5.220 1.79e-07 ***
## MVR_PTS_factModerate          -1.349e-01  1.077e-01  -1.252 0.210410    
## MVR_PTS_factHigh              -5.803e-01  1.958e-01  -2.964 0.003041 ** 
## AGE_log                        1.591e+03  8.996e+02   1.769 0.076909 .  
## AGE                           -1.276e+01  8.240e+00  -1.549 0.121400    
## AGE_cbrt                      -5.283e+03  3.072e+03  -1.720 0.085514 .  
## INCOME_factModerate            7.850e-02  1.190e-01   0.660 0.509474    
## INCOME_factHigh               -2.549e-01  1.924e-01  -1.325 0.185215    
## INCOME                         5.155e-06  2.416e-06   2.134 0.032828 *  
## AGE_sqrt                       1.575e+03  9.342e+02   1.685 0.091897 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 9418.0  on 8160  degrees of freedom
## Residual deviance: 7150.4  on 8113  degrees of freedom
## AIC: 7246.4
## 
## Number of Fisher Scoring iterations: 5

We see how all the predictors are statistical significant, also, noticing how the HOME_VAL and the INCOME are not as statistical significant compared to other variables.

The below plot shows our fitted models vs Density.

hist(step_b$fitted.values, main = " Histogram ",xlab = "Fitted models", col = 'skyblue3')

Show the predicted values

data_train$Predict <- ifelse(step_b$fitted.values >0.5,"pos","neg")
head(data_train$Predict)
## [1] "neg" "neg" "neg" "neg" "neg" "pos"

Confusion Matrix

Building a confusion matrix to get more insights.

step_c <- data_train
step_c$Predict = predict(step_b,type="response")
step_c$TARGET_FLAG_Predict <- round(step_c$Predict)
cMatrix <- confusionMatrix(data = as.factor(step_c$TARGET_FLAG_Predict),
                           reference = as.factor(step_c$TARGET_FLAG),
                           positive = '1')
cMatrix
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 5546 1186
##          1  462  967
##                                           
##                Accuracy : 0.7981          
##                  95% CI : (0.7892, 0.8067)
##     No Information Rate : 0.7362          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.4173          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.4491          
##             Specificity : 0.9231          
##          Pos Pred Value : 0.6767          
##          Neg Pred Value : 0.8238          
##              Prevalence : 0.2638          
##          Detection Rate : 0.1185          
##    Detection Prevalence : 0.1751          
##       Balanced Accuracy : 0.6861          
##                                           
##        'Positive' Class : 1               
## 

We can see the accuracy is 0.7981

data.frame(Value = cMatrix$byClass)
##                          Value
## Sensitivity          0.4491407
## Specificity          0.9231025
## Pos Pred Value       0.6766970
## Neg Pred Value       0.8238265
## Precision            0.6766970
## Recall               0.4491407
## F1                   0.5399218
## Prevalence           0.2638157
## Detection Rate       0.1184904
## Detection Prevalence 0.1751011
## Balanced Accuracy    0.6861216

Binary STEP MODIFIED Model

For the following variables I will add 1 and calculate the log, to avoid errors since some entries reported 0 and log(0) will produce errors, also I will remove the variable HOMEKIDS and KIDSDRIV

  • Log(1 + INCOME) - Log(1 + HOME_VAL) - Log(1 + BLUEBOOK) - Log(1 + OLDCLAIM) - HOMEKIDS Remove - KIDSDRIV Remove
step_modified <- glm(formula = TARGET_FLAG ~ KIDSDRIV +AGE+HOMEKIDS+YOJ+log(1 + INCOME)+PARENT1+ log(1+HOME_VAL) + MSTATUS+SEX+EDUCATION+JOB+TRAVTIME+CAR_USE+ log(1+BLUEBOOK) + TIF + CAR_TYPE + RED_CAR + log(1 + OLDCLAIM)+CLM_FREQ+REVOKED+URBANICITY+CAR_AGE, family = binomial(link = "logit"), data = data_train)
## Warning in log(1 + INCOME): NaNs produced
## Warning in log(1 + HOME_VAL): NaNs produced

Let’s see the summary:

summary(step_modified)
## 
## Call:
## glm(formula = TARGET_FLAG ~ KIDSDRIV + AGE + HOMEKIDS + YOJ + 
##     log(1 + INCOME) + PARENT1 + log(1 + HOME_VAL) + MSTATUS + 
##     SEX + EDUCATION + JOB + TRAVTIME + CAR_USE + log(1 + BLUEBOOK) + 
##     TIF + CAR_TYPE + RED_CAR + log(1 + OLDCLAIM) + CLM_FREQ + 
##     REVOKED + URBANICITY + CAR_AGE, family = binomial(link = "logit"), 
##     data = data_train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.4049  -0.7144  -0.4006   0.6510   3.1601  
## 
## Coefficients:
##                                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                    1.974062   0.737363   2.677 0.007424 ** 
## KIDSDRIV1                      0.465540   0.114462   4.067 4.76e-05 ***
## KIDSDRIV2                      0.774529   0.164434   4.710 2.47e-06 ***
## KIDSDRIV3                      1.017934   0.316353   3.218 0.001292 ** 
## KIDSDRIV4                      1.306006   1.127510   1.158 0.246738    
## AGE                           -0.002913   0.004220  -0.690 0.489941    
## HOMEKIDS1                      0.336324   0.118736   2.833 0.004618 ** 
## HOMEKIDS2                      0.186599   0.117836   1.584 0.113296    
## HOMEKIDS3                      0.138269   0.137396   1.006 0.314245    
## HOMEKIDS4                     -0.064806   0.222983  -0.291 0.771333    
## HOMEKIDS5                      0.391761   0.655364   0.598 0.549990    
## YOJ                            0.023942   0.011540   2.075 0.038010 *  
## log(1 + INCOME)               -0.110011   0.019108  -5.757 8.54e-09 ***
## PARENT1Yes                     0.230930   0.121109   1.907 0.056546 .  
## log(1 + HOME_VAL)             -0.029475   0.007051  -4.181 2.91e-05 ***
## MSTATUSYes                    -0.551157   0.091142  -6.047 1.47e-09 ***
## SEXM                           0.123201   0.108533   1.135 0.256314    
## EDUCATIONBachelors            -0.403709   0.116647  -3.461 0.000538 ***
## EDUCATIONHigh School           0.011904   0.095893   0.124 0.901205    
## EDUCATIONMasters              -0.308092   0.181943  -1.693 0.090390 .  
## EDUCATIONPhD                  -0.333824   0.210341  -1.587 0.112498    
## JOBBlue Collar                 0.388576   0.184997   2.100 0.035690 *  
## JOBClerical                    0.517336   0.194101   2.665 0.007692 ** 
## JOBDoctor                     -0.399476   0.264143  -1.512 0.130446    
## JOBHome Maker                  0.008411   0.220423   0.038 0.969562    
## JOBLawyer                      0.151671   0.168561   0.900 0.368227    
## JOBManager                    -0.532654   0.169874  -3.136 0.001715 ** 
## JOBProfessional                0.227576   0.177783   1.280 0.200518    
## JOBStudent                    -0.089603   0.226668  -0.395 0.692618    
## TRAVTIME                       0.015232   0.001894   8.041 8.94e-16 ***
## CAR_USEPrivate                -0.776008   0.092521  -8.387  < 2e-16 ***
## log(1 + BLUEBOOK)             -0.283129   0.059411  -4.766 1.88e-06 ***
## TIF                           -0.056250   0.007385  -7.616 2.61e-14 ***
## CAR_TYPEPanel Truck            0.441112   0.150483   2.931 0.003375 ** 
## CAR_TYPEPickup                 0.597242   0.100732   5.929 3.05e-09 ***
## CAR_TYPESports Car             1.065882   0.128519   8.294  < 2e-16 ***
## CAR_TYPESUV                    0.825548   0.108027   7.642 2.14e-14 ***
## CAR_TYPEVan                    0.628435   0.125415   5.011 5.42e-07 ***
## RED_CARyes                    -0.004075   0.086455  -0.047 0.962408    
## log(1 + OLDCLAIM)             -0.161334   0.045645  -3.535 0.000409 ***
## CLM_FREQ.L                     1.529063   0.412015   3.711 0.000206 ***
## CLM_FREQ.Q                    -0.907783   0.373831  -2.428 0.015169 *  
## CLM_FREQ.C                     0.784731   0.272593   2.879 0.003992 ** 
## CLM_FREQ^4                    -0.412620   0.169157  -2.439 0.014717 *  
## CLM_FREQ^5                     0.086024   0.097661   0.881 0.378402    
## REVOKEDYes                     0.848297   0.088907   9.541  < 2e-16 ***
## URBANICITYHighly Urban/ Urban  2.383108   0.114552  20.804  < 2e-16 ***
## CAR_AGE                       -0.004613   0.007804  -0.591 0.554515    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 9337.1  on 8095  degrees of freedom
## Residual deviance: 7247.3  on 8048  degrees of freedom
##   (65 observations deleted due to missingness)
## AIC: 7343.3
## 
## Number of Fisher Scoring iterations: 5

With the transformation it looks slightly better the AIC is lower than the automatically selected model by the STEP procedure.

Plot of standardized residuals

The below plot shows our fitted models vs the deviance standardized residuals.

plot(fitted(step_modified),
     rstandard(step_modified),
     main = 'Standarize residuals',
     xlab = 'Fitted values',
     ylab = ' Residuals',
     col = 'blue')

Confusion Matrix

Let’s check teh confusion matrix.

step_modified_b <- step_modified
step_modified_b$Predict = predict(step_modified,type="response")
step_modified_b$TARGET_FLAG_Predict <- round(step_modified_b$Predict)
cMatrix <- confusionMatrix(data = as.factor(step_modified_b$TARGET_FLAG_Predict),
                           reference = as.factor(step_modified_b$TARGET_FLAG),
                           positive = '1')
cMatrix
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 6769    0
##          1    0 1327
##                                      
##                Accuracy : 1          
##                  95% CI : (0.9995, 1)
##     No Information Rate : 0.8361     
##     P-Value [Acc > NIR] : < 2.2e-16  
##                                      
##                   Kappa : 1          
##                                      
##  Mcnemar's Test P-Value : NA         
##                                      
##             Sensitivity : 1.0000     
##             Specificity : 1.0000     
##          Pos Pred Value : 1.0000     
##          Neg Pred Value : 1.0000     
##              Prevalence : 0.1639     
##          Detection Rate : 0.1639     
##    Detection Prevalence : 0.1639     
##       Balanced Accuracy : 1.0000     
##                                      
##        'Positive' Class : 1          
## 

Is interesting to note that the reported Accuracy is

From the above results, we obtain as follows:

data.frame(Value = cMatrix$byClass)
##                          Value
## Sensitivity          1.0000000
## Specificity          1.0000000
## Pos Pred Value       1.0000000
## Neg Pred Value       1.0000000
## Precision            1.0000000
## Recall               1.0000000
## F1                   1.0000000
## Prevalence           0.1639081
## Detection Rate       0.1639081
## Detection Prevalence 0.1639081
## Balanced Accuracy    1.0000000