DATA 621 – Business Analytics and Data Mining Homework #4 Assignment Requirements

Overview In this homework assignment, you will explore, analyze and model a data set containing approximately 8000 records representing a customer at an auto insurance company. Each record has two response variables. The first response variable, TARGET_FLAG, is a 1 or a 0. A “1” means that the person was in a car crash. A zero means that the person was not in a car crash. The second response variable is TARGET_AMT. This value is zero if the person did not crash their car. But if they did crash their car, this number will be a value greater than zero.

Your objective is to build multiple linear regression and binary logistic regression models on the training data to predict the probability that a person will crash their car and also the amount of money it will cost if the person does crash their car. You can only use the variables given to you (or variables that you derive from the variables provided).

library(tidyverse)
library(ggplot2)
library(mice)
library(car)
library(Hmisc)
library(corrplot)
library(pscl)
library(boot)
library(MASS)

Load data

# Load insurance csv
df_ins_raw <- read.csv("insurance_training_data.csv")

# Removing index as instructed
df_ins_raw <- subset(df_ins_raw, select = -c(INDEX))

# Preview data
glimpse(df_ins_raw)
## Rows: 8,161
## Columns: 25
## $ TARGET_FLAG <int> 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1…
## $ TARGET_AMT  <dbl> 0.000, 0.000, 0.000, 0.000, 0.000, 2946.000, 0.000, 4021.0…
## $ KIDSDRIV    <int> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ AGE         <int> 60, 43, 35, 51, 50, 34, 54, 37, 34, 50, 53, 43, 55, 53, 45…
## $ HOMEKIDS    <int> 0, 0, 1, 0, 0, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 3, 0, 3, 2, 1…
## $ YOJ         <int> 11, 11, 10, 14, NA, 12, NA, NA, 10, 7, 14, 5, 11, 11, 0, 1…
## $ INCOME      <chr> "$67,349", "$91,449", "$16,039", "", "$114,986", "$125,301…
## $ PARENT1     <chr> "No", "No", "No", "No", "No", "Yes", "No", "No", "No", "No…
## $ HOME_VAL    <chr> "$0", "$257,252", "$124,191", "$306,251", "$243,925", "$0"…
## $ MSTATUS     <chr> "z_No", "z_No", "Yes", "Yes", "Yes", "z_No", "Yes", "Yes",…
## $ SEX         <chr> "M", "M", "z_F", "M", "z_F", "z_F", "z_F", "M", "z_F", "M"…
## $ EDUCATION   <chr> "PhD", "z_High School", "z_High School", "<High School", "…
## $ JOB         <chr> "Professional", "z_Blue Collar", "Clerical", "z_Blue Colla…
## $ TRAVTIME    <int> 14, 22, 5, 32, 36, 46, 33, 44, 34, 48, 15, 36, 25, 64, 48,…
## $ CAR_USE     <chr> "Private", "Commercial", "Private", "Private", "Private", …
## $ BLUEBOOK    <chr> "$14,230", "$14,940", "$4,010", "$15,440", "$18,000", "$17…
## $ TIF         <int> 11, 1, 4, 7, 1, 1, 1, 1, 1, 7, 1, 7, 7, 6, 1, 6, 6, 7, 4, …
## $ CAR_TYPE    <chr> "Minivan", "Minivan", "z_SUV", "Minivan", "z_SUV", "Sports…
## $ RED_CAR     <chr> "yes", "yes", "no", "yes", "no", "no", "no", "yes", "no", …
## $ OLDCLAIM    <chr> "$4,461", "$0", "$38,690", "$0", "$19,217", "$0", "$0", "$…
## $ CLM_FREQ    <int> 2, 0, 2, 0, 2, 0, 0, 1, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 2…
## $ REVOKED     <chr> "No", "No", "No", "No", "Yes", "No", "No", "Yes", "No", "N…
## $ MVR_PTS     <int> 3, 0, 3, 0, 3, 0, 0, 10, 0, 1, 0, 0, 3, 3, 3, 0, 0, 0, 0, …
## $ CAR_AGE     <int> 18, 1, 10, 6, 17, 7, 1, 7, 1, 17, 11, 1, 9, 10, 5, 13, 16,…
## $ URBANICITY  <chr> "Highly Urban/ Urban", "Highly Urban/ Urban", "Highly Urba…

DATA CLEANING

Fix formatting

remove_z <-  function(x){
  str_replace(x, 'z_', '')
}

# Remove extraneous z_
df_ins_raw <- mutate_all(df_ins_raw, funs(remove_z))
## Warning: `funs()` was deprecated in dplyr 0.8.0.
## Please use a list of either functions or lambdas: 
## 
##   # Simple named list: 
##   list(mean = mean, median = median)
## 
##   # Auto named with `tibble::lst()`: 
##   tibble::lst(mean, median)
## 
##   # Using lambdas
##   list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
remove_dollar <-  function(x){
  str_replace(x, '\\$', '')
}

# Remove dollar sign from variables
df_ins_raw <- mutate_all(df_ins_raw, funs(remove_dollar))

remove_comma <- function(x){
  str_replace(x, ',', '')
}

# Remove commas from variables
df_ins_raw <- mutate_all(df_ins_raw, funs(remove_comma))

# Preview updated data
glimpse(df_ins_raw)
## Rows: 8,161
## Columns: 25
## $ TARGET_FLAG <chr> "0", "0", "0", "0", "0", "1", "0", "1", "1", "0", "1", "0"…
## $ TARGET_AMT  <chr> "0", "0", "0", "0", "0", "2946", "0", "4021", "2501", "0",…
## $ KIDSDRIV    <chr> "0", "0", "0", "0", "0", "0", "0", "1", "0", "0", "0", "0"…
## $ AGE         <chr> "60", "43", "35", "51", "50", "34", "54", "37", "34", "50"…
## $ HOMEKIDS    <chr> "0", "0", "1", "0", "0", "1", "0", "2", "0", "0", "0", "0"…
## $ YOJ         <chr> "11", "11", "10", "14", NA, "12", NA, NA, "10", "7", "14",…
## $ INCOME      <chr> "67349", "91449", "16039", "", "114986", "125301", "18755"…
## $ PARENT1     <chr> "No", "No", "No", "No", "No", "Yes", "No", "No", "No", "No…
## $ HOME_VAL    <chr> "0", "257252", "124191", "306251", "243925", "0", "", "333…
## $ MSTATUS     <chr> "No", "No", "Yes", "Yes", "Yes", "No", "Yes", "Yes", "No",…
## $ SEX         <chr> "M", "M", "F", "M", "F", "F", "F", "M", "F", "M", "F", "F"…
## $ EDUCATION   <chr> "PhD", "High School", "High School", "<High School", "PhD"…
## $ JOB         <chr> "Professional", "Blue Collar", "Clerical", "Blue Collar", …
## $ TRAVTIME    <chr> "14", "22", "5", "32", "36", "46", "33", "44", "34", "48",…
## $ CAR_USE     <chr> "Private", "Commercial", "Private", "Private", "Private", …
## $ BLUEBOOK    <chr> "14230", "14940", "4010", "15440", "18000", "17430", "8780…
## $ TIF         <chr> "11", "1", "4", "7", "1", "1", "1", "1", "1", "7", "1", "7…
## $ CAR_TYPE    <chr> "Minivan", "Minivan", "SUV", "Minivan", "SUV", "Sports Car…
## $ RED_CAR     <chr> "yes", "yes", "no", "yes", "no", "no", "no", "yes", "no", …
## $ OLDCLAIM    <chr> "4461", "0", "38690", "0", "19217", "0", "0", "2374", "0",…
## $ CLM_FREQ    <chr> "2", "0", "2", "0", "2", "0", "0", "1", "0", "0", "0", "0"…
## $ REVOKED     <chr> "No", "No", "No", "No", "Yes", "No", "No", "Yes", "No", "N…
## $ MVR_PTS     <chr> "3", "0", "3", "0", "3", "0", "0", "10", "0", "1", "0", "0…
## $ CAR_AGE     <chr> "18", "1", "10", "6", "17", "7", "1", "7", "1", "17", "11"…
## $ URBANICITY  <chr> "Highly Urban/ Urban", "Highly Urban/ Urban", "Highly Urba…

Review distinct values

# Count of distinct values for each column
df_ins_raw %>% summarise_all(n_distinct)
##   TARGET_FLAG TARGET_AMT KIDSDRIV AGE HOMEKIDS YOJ INCOME PARENT1 HOME_VAL
## 1           2       1949        5  61        6  22   6613       2     5107
##   MSTATUS SEX EDUCATION JOB TRAVTIME CAR_USE BLUEBOOK TIF CAR_TYPE RED_CAR
## 1       2   2         5   9       97       2     2789  23        6       2
##   OLDCLAIM CLM_FREQ REVOKED MVR_PTS CAR_AGE URBANICITY
## 1     2857        6       2      13      31          2
df_ins_raw %>% distinct(PARENT1)
##   PARENT1
## 1      No
## 2     Yes
df_ins_raw %>% distinct(MSTATUS)
##   MSTATUS
## 1      No
## 2     Yes
df_ins_raw %>% distinct(SEX)
##   SEX
## 1   M
## 2   F
df_ins_raw %>% distinct(EDUCATION)
##      EDUCATION
## 1          PhD
## 2  High School
## 3 <High School
## 4    Bachelors
## 5      Masters
df_ins_raw %>% distinct(JOB)
##            JOB
## 1 Professional
## 2  Blue Collar
## 3     Clerical
## 4       Doctor
## 5       Lawyer
## 6      Manager
## 7             
## 8   Home Maker
## 9      Student
df_ins_raw %>% distinct(CAR_USE)
##      CAR_USE
## 1    Private
## 2 Commercial
df_ins_raw %>% distinct(CAR_TYPE)
##      CAR_TYPE
## 1     Minivan
## 2         SUV
## 3  Sports Car
## 4         Van
## 5 Panel Truck
## 6      Pickup
df_ins_raw %>% distinct(CLM_FREQ)
##   CLM_FREQ
## 1        2
## 2        0
## 3        1
## 4        3
## 5        5
## 6        4
df_ins_raw %>% distinct(REVOKED)
##   REVOKED
## 1      No
## 2     Yes
df_ins_raw %>% distinct(URBANICITY)
##            URBANICITY
## 1 Highly Urban/ Urban
## 2 Highly Rural/ Rural

Convert datatypes

# Set data types for variables
df_ins_clean <- df_ins_raw %>% transform( 
               TARGET_FLAG = as.factor(TARGET_FLAG), 
               TARGET_AMT = as.numeric(TARGET_AMT),
               KIDSDRIV = as.factor(KIDSDRIV),
               AGE = as.numeric(AGE),
               HOMEKIDS = as.factor(HOMEKIDS),
               YOJ = as.numeric(YOJ),
               INCOME = as.numeric(INCOME),
               PARENT1 = as.factor(PARENT1),
               HOME_VAL = as.numeric(HOME_VAL),
               MSTATUS = as.factor(MSTATUS),
               SEX = as.factor(SEX),
               EDUCATION = as.factor(EDUCATION),
               JOB = as.factor(JOB),
               TRAVTIME = as.numeric(TRAVTIME),
               CAR_USE = as.factor(CAR_USE),
               BLUEBOOK = as.numeric(BLUEBOOK),
               TIF = as.numeric(TIF), # factor or numeric?
               CAR_TYPE = as.factor(CAR_TYPE),
               RED_CAR = as.factor(RED_CAR),
               OLDCLAIM = as.numeric(OLDCLAIM),
               CLM_FREQ = as.ordered(CLM_FREQ),  # factor or numeric?
               REVOKED = as.factor(REVOKED),
               MVR_PTS = as.numeric(MVR_PTS),
               CAR_AGE = as.numeric(CAR_AGE),
               URBANICITY = as.factor(URBANICITY))

# Confirm CLM_FREQ is an ordered factor
is.ordered(df_ins_clean$CLM_FREQ)
## [1] TRUE

Review NAs

# NA counts for each column
colSums(is.na(df_ins_clean))
## TARGET_FLAG  TARGET_AMT    KIDSDRIV         AGE    HOMEKIDS         YOJ 
##           0           0           0           6           0         454 
##      INCOME     PARENT1    HOME_VAL     MSTATUS         SEX   EDUCATION 
##         445           0         464           0           0           0 
##         JOB    TRAVTIME     CAR_USE    BLUEBOOK         TIF    CAR_TYPE 
##           0           0           0           0           0           0 
##     RED_CAR    OLDCLAIM    CLM_FREQ     REVOKED     MVR_PTS     CAR_AGE 
##           0           0           0           0           0         510 
##  URBANICITY 
##           0
# Visualize NA counts for each column
df_ins_clean  %>%
  summarise_all(list(~is.na(.)))%>%
  pivot_longer(everything(),
               names_to = "variables", values_to="missing") %>%
  count(variables, missing) %>%
  ggplot(aes(y=variables,x=n,fill=missing))+
  geom_col()

Data imputation

# Impute data by regression: 
df_ins_imp <- mice(df_ins_clean, method = "norm.predict", m = 1, remove.collinear=FALSE)
## 
##  iter imp variable
##   1   1  AGE  YOJ  INCOME  HOME_VAL  CAR_AGE
##   2   1  AGE  YOJ  INCOME  HOME_VAL  CAR_AGE
##   3   1  AGE  YOJ  INCOME  HOME_VAL  CAR_AGE
##   4   1  AGE  YOJ  INCOME  HOME_VAL  CAR_AGE
##   5   1  AGE  YOJ  INCOME  HOME_VAL  CAR_AGE
df_ins_imp <- complete(df_ins_imp)

# Confirm no NAs remain
colSums(is.na(df_ins_imp))
## TARGET_FLAG  TARGET_AMT    KIDSDRIV         AGE    HOMEKIDS         YOJ 
##           0           0           0           0           0           0 
##      INCOME     PARENT1    HOME_VAL     MSTATUS         SEX   EDUCATION 
##           0           0           0           0           0           0 
##         JOB    TRAVTIME     CAR_USE    BLUEBOOK         TIF    CAR_TYPE 
##           0           0           0           0           0           0 
##     RED_CAR    OLDCLAIM    CLM_FREQ     REVOKED     MVR_PTS     CAR_AGE 
##           0           0           0           0           0           0 
##  URBANICITY 
##           0

DATA EXPLORATION

Summary statistics

describe(df_ins_imp)
## df_ins_imp 
## 
##  25  Variables      8161  Observations
## --------------------------------------------------------------------------------
## TARGET_FLAG 
##        n  missing distinct 
##     8161        0        2 
##                       
## Value          0     1
## Frequency   6008  2153
## Proportion 0.736 0.264
## --------------------------------------------------------------------------------
## TARGET_AMT 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     8161        0     1949    0.601     1504     2574        0        0 
##      .25      .50      .75      .90      .95 
##        0        0     1036     4904     6452 
## 
## lowest :      0.00000     30.27728     58.53106     95.56732    108.74150
## highest:  73783.46592  77907.43028  78874.19056  85523.65335 107586.13616
## --------------------------------------------------------------------------------
## KIDSDRIV 
##        n  missing distinct 
##     8161        0        5 
## 
## lowest : 0 1 2 3 4, highest: 0 1 2 3 4
##                                         
## Value          0     1     2     3     4
## Frequency   7180   636   279    62     4
## Proportion 0.880 0.078 0.034 0.008 0.000
## --------------------------------------------------------------------------------
## AGE 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     8161        0       66    0.999    44.78    9.749       30       34 
##      .25      .50      .75      .90      .95 
##       39       45       51       56       59 
## 
## lowest : 16 17 18 19 20, highest: 72 73 76 80 81
## --------------------------------------------------------------------------------
## HOMEKIDS 
##        n  missing distinct 
##     8161        0        6 
## 
## lowest : 0 1 2 3 4, highest: 1 2 3 4 5
##                                               
## Value          0     1     2     3     4     5
## Frequency   5289   902  1118   674   164    14
## Proportion 0.648 0.111 0.137 0.083 0.020 0.002
## --------------------------------------------------------------------------------
## YOJ 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     8161        0      475    0.991     10.5      4.2        0        5 
##      .25      .50      .75      .90      .95 
##        9       11       13       14       15 
## 
## lowest :  0.000000  1.000000  2.000000  2.748155  3.000000
## highest: 16.348253 17.000000 18.000000 19.000000 23.000000
## --------------------------------------------------------------------------------
## INCOME 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     8161        0     7057        1    61594    51009        0     4362 
##      .25      .50      .75      .90      .95 
##    27940    54216    85472   122744   151663 
## 
## lowest : -31968.54 -26991.29 -20478.35 -16829.41 -16714.01
## highest: 306277.00 309628.00 320127.00 332339.00 367030.00
## --------------------------------------------------------------------------------
## PARENT1 
##        n  missing distinct 
##     8161        0        2 
##                       
## Value         No   Yes
## Frequency   7084  1077
## Proportion 0.868 0.132
## --------------------------------------------------------------------------------
## HOME_VAL 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     8161        0     5570    0.978   154934   142397        0        0 
##      .25      .50      .75      .90      .95 
##        0   160953   237604   314151   373031 
## 
## lowest : -86468.16 -71570.86 -71165.20 -70648.14 -68150.78
## highest: 657804.00 682634.00 738153.00 750455.00 885282.00
## --------------------------------------------------------------------------------
## MSTATUS 
##        n  missing distinct 
##     8161        0        2 
##                     
## Value        No  Yes
## Frequency  3267 4894
## Proportion  0.4  0.6
## --------------------------------------------------------------------------------
## SEX 
##        n  missing distinct 
##     8161        0        2 
##                       
## Value          F     M
## Frequency   4375  3786
## Proportion 0.536 0.464
## --------------------------------------------------------------------------------
## EDUCATION 
##        n  missing distinct 
##     8161        0        5 
## 
## lowest : <High School Bachelors    High School  Masters      PhD         
## highest: <High School Bachelors    High School  Masters      PhD         
##                                                                            
## Value      <High School    Bachelors  High School      Masters          PhD
## Frequency          1203         2242         2330         1658          728
## Proportion        0.147        0.275        0.286        0.203        0.089
## --------------------------------------------------------------------------------
## JOB 
##        n  missing distinct 
##     8161        0        9 
## 
## lowest :              Blue Collar  Clerical     Doctor       Home Maker  
## highest: Home Maker   Lawyer       Manager      Professional Student     
##                                                                            
## Value                    Blue Collar     Clerical       Doctor   Home Maker
## Frequency           526         1825         1271          246          641
## Proportion        0.064        0.224        0.156        0.030        0.079
##                                                               
## Value            Lawyer      Manager Professional      Student
## Frequency           835          988         1117          712
## Proportion        0.102        0.121        0.137        0.087
## --------------------------------------------------------------------------------
## TRAVTIME 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     8161        0       97        1    33.49    17.85        7       13 
##      .25      .50      .75      .90      .95 
##       22       33       44       54       60 
## 
## lowest :   5   6   7   8   9, highest: 103 113 124 134 142
## --------------------------------------------------------------------------------
## CAR_USE 
##        n  missing distinct 
##     8161        0        2 
##                                 
## Value      Commercial    Private
## Frequency        3029       5132
## Proportion      0.371      0.629
## --------------------------------------------------------------------------------
## BLUEBOOK 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     8161        0     2789        1    15710     9354     4900     6000 
##      .25      .50      .75      .90      .95 
##     9280    14440    20850    27460    31110 
## 
## lowest :  1500  1520  1530  1540  1590, highest: 57970 61050 62240 65970 69740
## --------------------------------------------------------------------------------
## TIF 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     8161        0       23    0.961    5.351    4.512        1        1 
##      .25      .50      .75      .90      .95 
##        1        4        7       11       13 
## 
## lowest :  1  2  3  4  5, highest: 19 20 21 22 25
## --------------------------------------------------------------------------------
## CAR_TYPE 
##        n  missing distinct 
##     8161        0        6 
## 
## lowest : Minivan     Panel Truck Pickup      Sports Car  SUV        
## highest: Panel Truck Pickup      Sports Car  SUV         Van        
##                                                                       
## Value          Minivan Panel Truck      Pickup  Sports Car         SUV
## Frequency         2145         676        1389         907        2294
## Proportion       0.263       0.083       0.170       0.111       0.281
##                       
## Value              Van
## Frequency          750
## Proportion       0.092
## --------------------------------------------------------------------------------
## RED_CAR 
##        n  missing distinct 
##     8161        0        2 
##                       
## Value         no   yes
## Frequency   5783  2378
## Proportion 0.709 0.291
## --------------------------------------------------------------------------------
## OLDCLAIM 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     8161        0     2857    0.769     4037     6563        0        0 
##      .25      .50      .75      .90      .95 
##        0        0     4636     9583    27090 
## 
## lowest :     0   502   506   518   519, highest: 52507 53477 53568 53986 57037
## --------------------------------------------------------------------------------
## CLM_FREQ 
##        n  missing distinct 
##     8161        0        6 
## 
## lowest : 0 1 2 3 4, highest: 1 2 3 4 5
##                                               
## Value          0     1     2     3     4     5
## Frequency   5009   997  1171   776   190    18
## Proportion 0.614 0.122 0.143 0.095 0.023 0.002
## --------------------------------------------------------------------------------
## REVOKED 
##        n  missing distinct 
##     8161        0        2 
##                       
## Value         No   Yes
## Frequency   7161  1000
## Proportion 0.877 0.123
## --------------------------------------------------------------------------------
## MVR_PTS 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     8161        0       13      0.9    1.696    2.187        0        0 
##      .25      .50      .75      .90      .95 
##        0        1        3        5        6 
## 
## lowest :  0  1  2  3  4, highest:  8  9 10 11 13
##                                                                             
## Value          0     1     2     3     4     5     6     7     8     9    10
## Frequency   3712  1157   948   758   599   399   266   167    84    45    13
## Proportion 0.455 0.142 0.116 0.093 0.073 0.049 0.033 0.020 0.010 0.006 0.002
##                       
## Value         11    13
## Frequency     11     2
## Proportion 0.001 0.000
## --------------------------------------------------------------------------------
## CAR_AGE 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     8161        0      540    0.985    8.347    6.374    1.000    1.000 
##      .25      .50      .75      .90      .95 
##    3.514    8.000   12.000   16.000   18.000 
## 
## lowest : -3.0000000 -1.1184480 -0.9423218  0.0000000  1.0000000
## highest: 24.0000000 25.0000000 26.0000000 27.0000000 28.0000000
## --------------------------------------------------------------------------------
## URBANICITY 
##        n  missing distinct 
##     8161        0        2 
##                                                   
## Value      Highly Rural/ Rural Highly Urban/ Urban
## Frequency                 1669                6492
## Proportion               0.205               0.795
## --------------------------------------------------------------------------------

Distributions of variables

# Histograms
df_ins_imp %>%
  keep(is.numeric) %>% 
  gather() %>%
  ggplot(aes(value)) +
    facet_wrap(~ key, scales = "free") +
    geom_density(fill = "steelblue", alpha=0.9, color="steelblue") +
    geom_histogram(aes(y=..density..), alpha=0.5, fill = "lightblue", color="lightblue", position="identity")

# Boxplots
df_ins_imp %>%
  keep(is.numeric) %>% 
  gather() %>%
  ggplot(aes(value)) +
  facet_wrap(~ key, scales = "free") +
  geom_boxplot(fill = "steelblue", color="black", outlier.colour="red", outlier.shape=16,
             outlier.size=2, notch=FALSE)

Distributions of log-transformed variables

# Log transformation
df_ins_imp_log <- df_ins_imp %>% keep(is.numeric)
df_ins_imp_log <- log(df_ins_imp_log + 1)

# Histograms of log transformed numeric variables
df_ins_imp_log %>%
  gather(variable, value, TARGET_AMT:CAR_AGE) %>%
  ggplot(., aes(value)) + 
  geom_density(fill = "steelblue", color="steelblue") + 
  facet_wrap(~variable, scales ="free", ncol = 4) +
  labs(x = element_blank(), y = element_blank())

# Test for normality
shapiro.test(df_ins_imp_log$AGE[0:5000])
## 
##  Shapiro-Wilk normality test
## 
## data:  df_ins_imp_log$AGE[0:5000]
## W = 0.97949, p-value < 2.2e-16
# Visual inspection of one variable (age) for normality
qqnorm(df_ins_imp_log$AGE, pch = 1, frame = FALSE)
qqline(df_ins_imp_log$AGE, col = "steelblue", lwd = 2)

Distributions of square root-transformed variables

# Square root transformation
df_ins_imp_sqrt <- sqrt(df_ins_imp %>% keep(is.numeric))

# Histograms of square root transformed numeric variables
df_ins_imp_sqrt %>%
  gather(variable, value, TARGET_AMT:CAR_AGE) %>%
  ggplot(., aes(value)) + 
  geom_density(fill = "steelblue", color="steelblue") + 
  facet_wrap(~variable, scales ="free", ncol = 4) +
  labs(x = element_blank(), y = element_blank())

# Test for normality
shapiro.test(df_ins_imp_sqrt$AGE[0:5000])
## 
##  Shapiro-Wilk normality test
## 
## data:  df_ins_imp_sqrt$AGE[0:5000]
## W = 0.99371, p-value = 5.047e-14
# Visual inspection of one variable (age) for normality
qqnorm(df_ins_imp_sqrt$AGE, pch = 1, frame = FALSE)
qqline(df_ins_imp_sqrt$AGE, col = "steelblue", lwd = 2)

Distributions of cube root-transformed variables

# Cube root transformation
df_ins_imp_cube <- (df_ins_imp %>% keep(is.numeric))^(1/3)

# Histograms of cube root transformed numeric variables
df_ins_imp_cube %>%
  gather(variable, value, TARGET_AMT:CAR_AGE) %>%
  ggplot(., aes(value)) + 
  geom_density(fill = "steelblue", color="steelblue") + 
  facet_wrap(~variable, scales ="free", ncol = 4) +
  labs(x = element_blank(), y = element_blank())

# Test for normality
shapiro.test(df_ins_imp_cube$AGE[0:5000])
## 
##  Shapiro-Wilk normality test
## 
## data:  df_ins_imp_cube$AGE[0:5000]
## W = 0.98989, p-value < 2.2e-16
# Visual inspection of one variable (age) for normality
qqnorm(df_ins_imp_cube$AGE, pch = 1, frame = FALSE)
qqline(df_ins_imp_cube$AGE, col = "steelblue", lwd = 2)

Create new columns with transformed variables

df_ins <- df_ins_imp %>%
                  mutate(across(c(TARGET_AMT, AGE, YOJ, INCOME, HOME_VAL, TRAVTIME, BLUEBOOK, 
                                  TIF, OLDCLAIM, MVR_PTS, CAR_AGE), .fns = list(log = ~ log(. + 1))))

df_ins <- df_ins %>%
                  mutate(across(c(TARGET_AMT, AGE, YOJ, INCOME, HOME_VAL, TRAVTIME, BLUEBOOK, 
                                  TIF, OLDCLAIM, MVR_PTS, CAR_AGE), .fns = list(sqrt = sqrt)))

df_ins <- df_ins %>%
                  mutate(across(c(TARGET_AMT, AGE, YOJ, INCOME, HOME_VAL, TRAVTIME, BLUEBOOK, 
                                  TIF, OLDCLAIM, MVR_PTS, CAR_AGE), .fns = list(cbrt = ~ .^(1/3))))

glimpse(df_ins)
## Rows: 8,161
## Columns: 58
## $ TARGET_FLAG     <fct> 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, …
## $ TARGET_AMT      <dbl> 0.000, 0.000, 0.000, 0.000, 0.000, 2946.000, 0.000, 40…
## $ KIDSDRIV        <fct> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ AGE             <dbl> 60, 43, 35, 51, 50, 34, 54, 37, 34, 50, 53, 43, 55, 53…
## $ HOMEKIDS        <fct> 0, 0, 1, 0, 0, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 3, 0, 3, …
## $ YOJ             <dbl> 11.00000, 11.00000, 10.00000, 14.00000, 11.34200, 12.0…
## $ INCOME          <dbl> 67349.00, 91449.00, 16039.00, 67185.86, 114986.00, 125…
## $ PARENT1         <fct> No, No, No, No, No, Yes, No, No, No, No, No, No, No, N…
## $ HOME_VAL        <dbl> 0.0, 257252.0, 124191.0, 306251.0, 243925.0, 0.0, 1654…
## $ MSTATUS         <fct> No, No, Yes, Yes, Yes, No, Yes, Yes, No, No, No, Yes, …
## $ SEX             <fct> M, M, F, M, F, F, F, M, F, M, F, F, M, M, F, F, M, F, …
## $ EDUCATION       <fct> PhD, High School, High School, <High School, PhD, Bach…
## $ JOB             <fct> Professional, Blue Collar, Clerical, Blue Collar, Doct…
## $ TRAVTIME        <dbl> 14, 22, 5, 32, 36, 46, 33, 44, 34, 48, 15, 36, 25, 64,…
## $ CAR_USE         <fct> Private, Commercial, Private, Private, Private, Commer…
## $ BLUEBOOK        <dbl> 14230, 14940, 4010, 15440, 18000, 17430, 8780, 16970, …
## $ TIF             <dbl> 11, 1, 4, 7, 1, 1, 1, 1, 1, 7, 1, 7, 7, 6, 1, 6, 6, 7,…
## $ CAR_TYPE        <fct> Minivan, Minivan, SUV, Minivan, SUV, Sports Car, SUV, …
## $ RED_CAR         <fct> yes, yes, no, yes, no, no, no, yes, no, no, no, no, ye…
## $ OLDCLAIM        <dbl> 4461, 0, 38690, 0, 19217, 0, 0, 2374, 0, 0, 0, 0, 5028…
## $ CLM_FREQ        <ord> 2, 0, 2, 0, 2, 0, 0, 1, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, …
## $ REVOKED         <fct> No, No, No, No, Yes, No, No, Yes, No, No, No, No, Yes,…
## $ MVR_PTS         <dbl> 3, 0, 3, 0, 3, 0, 0, 10, 0, 1, 0, 0, 3, 3, 3, 0, 0, 0,…
## $ CAR_AGE         <dbl> 18, 1, 10, 6, 17, 7, 1, 7, 1, 17, 11, 1, 9, 10, 5, 13,…
## $ URBANICITY      <fct> Highly Urban/ Urban, Highly Urban/ Urban, Highly Urban…
## $ TARGET_AMT_log  <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 7.98…
## $ AGE_log         <dbl> 4.110874, 3.784190, 3.583519, 3.951244, 3.931826, 3.55…
## $ YOJ_log         <dbl> 2.484907, 2.484907, 2.397895, 2.708050, 2.513008, 2.56…
## $ INCOME_log      <dbl> 11.117658, 11.423548, 9.682841, 11.115233, 11.652574, …
## $ HOME_VAL_log    <dbl> 0.00000, 12.45782, 11.72958, 12.63216, 12.40462, 0.000…
## $ TRAVTIME_log    <dbl> 2.708050, 3.135494, 1.791759, 3.496508, 3.610918, 3.85…
## $ BLUEBOOK_log    <dbl> 9.563178, 9.611864, 8.296796, 9.644782, 9.798183, 9.76…
## $ TIF_log         <dbl> 2.4849066, 0.6931472, 1.6094379, 2.0794415, 0.6931472,…
## $ OLDCLAIM_log    <dbl> 8.403352, 0.000000, 10.563362, 0.000000, 9.863603, 0.0…
## $ MVR_PTS_log     <dbl> 1.3862944, 0.0000000, 1.3862944, 0.0000000, 1.3862944,…
## $ CAR_AGE_log     <dbl> 2.9444390, 0.6931472, 2.3978953, 1.9459101, 2.8903718,…
## $ TARGET_AMT_sqrt <dbl> 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 54.27707,…
## $ AGE_sqrt        <dbl> 7.745967, 6.557439, 5.916080, 7.141428, 7.071068, 5.83…
## $ YOJ_sqrt        <dbl> 3.316625, 3.316625, 3.162278, 3.741657, 3.367788, 3.46…
## $ INCOME_sqrt     <dbl> 259.5169, 302.4054, 126.6452, 259.2024, 339.0959, 353.…
## $ HOME_VAL_sqrt   <dbl> 0.0000, 507.2002, 352.4074, 553.3995, 493.8876, 0.0000…
## $ TRAVTIME_sqrt   <dbl> 3.741657, 4.690416, 2.236068, 5.656854, 6.000000, 6.78…
## $ BLUEBOOK_sqrt   <dbl> 119.28956, 122.22929, 63.32456, 124.25780, 134.16408, …
## $ TIF_sqrt        <dbl> 3.316625, 1.000000, 2.000000, 2.645751, 1.000000, 1.00…
## $ OLDCLAIM_sqrt   <dbl> 66.79072, 0.00000, 196.69774, 0.00000, 138.62539, 0.00…
## $ MVR_PTS_sqrt    <dbl> 1.732051, 0.000000, 1.732051, 0.000000, 1.732051, 0.00…
## $ CAR_AGE_sqrt    <dbl> 4.242641, 1.000000, 3.162278, 2.449490, 4.123106, 2.64…
## $ TARGET_AMT_cbrt <dbl> 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 14.33544,…
## $ AGE_cbrt        <dbl> 3.914868, 3.503398, 3.271066, 3.708430, 3.684031, 3.23…
## $ YOJ_cbrt        <dbl> 2.223980, 2.223980, 2.154435, 2.410142, 2.246794, 2.28…
## $ INCOME_cbrt     <dbl> 40.68588, 45.05327, 25.21888, 40.65300, 48.62747, 50.0…
## $ HOME_VAL_cbrt   <dbl> 0.00000, 63.59939, 49.89190, 67.40506, 62.48159, 0.000…
## $ TRAVTIME_cbrt   <dbl> 2.410142, 2.802039, 1.709976, 3.174802, 3.301927, 3.58…
## $ BLUEBOOK_cbrt   <dbl> 24.23269, 24.62919, 15.88723, 24.90094, 26.20741, 25.9…
## $ TIF_cbrt        <dbl> 2.223980, 1.000000, 1.587401, 1.912931, 1.000000, 1.00…
## $ OLDCLAIM_cbrt   <dbl> 16.46180, 0.00000, 33.82202, 0.00000, 26.78522, 0.0000…
## $ MVR_PTS_cbrt    <dbl> 1.442250, 0.000000, 1.442250, 0.000000, 1.442250, 0.00…
## $ CAR_AGE_cbrt    <dbl> 2.620741, 1.000000, 2.154435, 1.817121, 2.571282, 1.91…

Create new variables

df_ins$HV_INC_RATIO <- df_ins$HOME_VAL / df_ins$INCOME

df_ins$TRT_MVR_PRODUCT <- df_ins$TRAVTIME * df_ins$MVR_PTS

df_ins$HV_INC_RATIO[is.nan(df_ins$HV_INC_RATIO)] <- 0
df_ins$HV_INC_RATIO[is.infinite(df_ins$HV_INC_RATIO)] <- 0

Data imputation again

# Impute data by regression: 
df_ins <- mice(df_ins, method = "norm.predict", m = 1, remove.collinear=FALSE)
## 
##  iter imp variable
##   1   1  INCOME_log*  HOME_VAL_log*  CAR_AGE_log*  INCOME_sqrt*  HOME_VAL_sqrt*  CAR_AGE_sqrt  INCOME_cbrt*  HOME_VAL_cbrt*  CAR_AGE_cbrt*
##   2   1  INCOME_log  HOME_VAL_log*  CAR_AGE_log*  INCOME_sqrt  HOME_VAL_sqrt*  CAR_AGE_sqrt  INCOME_cbrt*  HOME_VAL_cbrt*  CAR_AGE_cbrt*
##   3   1  INCOME_log*  HOME_VAL_log*  CAR_AGE_log*  INCOME_sqrt  HOME_VAL_sqrt*  CAR_AGE_sqrt*  INCOME_cbrt*  HOME_VAL_cbrt*  CAR_AGE_cbrt*
##   4   1  INCOME_log*  HOME_VAL_log*  CAR_AGE_log*  INCOME_sqrt  HOME_VAL_sqrt*  CAR_AGE_sqrt*  INCOME_cbrt*  HOME_VAL_cbrt*  CAR_AGE_cbrt*
##   5   1  INCOME_log*  HOME_VAL_log*  CAR_AGE_log*  INCOME_sqrt  HOME_VAL_sqrt*  CAR_AGE_sqrt*  INCOME_cbrt*  HOME_VAL_cbrt*  CAR_AGE_cbrt*
## Warning: Number of logged events: 83
df_ins <- complete(df_ins)

# Confirm no NAs remain
colSums(is.na(df_ins))
##     TARGET_FLAG      TARGET_AMT        KIDSDRIV             AGE        HOMEKIDS 
##               0               0               0               0               0 
##             YOJ          INCOME         PARENT1        HOME_VAL         MSTATUS 
##               0               0               0               0               0 
##             SEX       EDUCATION             JOB        TRAVTIME         CAR_USE 
##               0               0               0               0               0 
##        BLUEBOOK             TIF        CAR_TYPE         RED_CAR        OLDCLAIM 
##               0               0               0               0               0 
##        CLM_FREQ         REVOKED         MVR_PTS         CAR_AGE      URBANICITY 
##               0               0               0               0               0 
##  TARGET_AMT_log         AGE_log         YOJ_log      INCOME_log    HOME_VAL_log 
##               0               0               0               0               0 
##    TRAVTIME_log    BLUEBOOK_log         TIF_log    OLDCLAIM_log     MVR_PTS_log 
##               0               0               0               0               0 
##     CAR_AGE_log TARGET_AMT_sqrt        AGE_sqrt        YOJ_sqrt     INCOME_sqrt 
##               0               0               0               0               0 
##   HOME_VAL_sqrt   TRAVTIME_sqrt   BLUEBOOK_sqrt        TIF_sqrt   OLDCLAIM_sqrt 
##               0               0               0               0               0 
##    MVR_PTS_sqrt    CAR_AGE_sqrt TARGET_AMT_cbrt        AGE_cbrt        YOJ_cbrt 
##               0               0               0               0               0 
##     INCOME_cbrt   HOME_VAL_cbrt   TRAVTIME_cbrt   BLUEBOOK_cbrt        TIF_cbrt 
##               0               0               0               0               0 
##   OLDCLAIM_cbrt    MVR_PTS_cbrt    CAR_AGE_cbrt    HV_INC_RATIO TRT_MVR_PRODUCT 
##               0               0               0               0               0

Correlation of variables

# Visualize correlation between variables
corrplot(cor(df_ins_imp %>% keep(is.numeric)), method="shade", shade.col=NA, tl.col="black", tl.srt=45)

# Reshape correlation results
flattenCorrMatrix <- function(cormat, pmat) {
  ut <- upper.tri(cormat)
  data.frame(
    row = rownames(cormat)[row(cormat)[ut]],
    column = rownames(cormat)[col(cormat)[ut]],
    cor  =(cormat)[ut],
    p = pmat[ut]
    )
}

# Closer look at correlations of variables
corr_results <- rcorr(as.matrix(df_ins_imp %>% keep(is.numeric)))
df_corr <- flattenCorrMatrix(corr_results$r, corr_results$P)

# Noteworthy positive correlations
df_corr %>% filter(cor > 0.4)
##      row   column       cor p
## 1 INCOME HOME_VAL 0.5905964 0
## 2 INCOME BLUEBOOK 0.4347133 0
## 3 INCOME  CAR_AGE 0.4259203 0
# Noteworthy negative correlations
df_corr %>% filter(cor < -0.4)
## [1] row    column cor    p     
## <0 rows> (or 0-length row.names)
# Pair plot
pairs(df_ins_imp %>% keep(is.numeric), lower.panel = NULL, col = "steelblue")

Check for multicollinearity

model <- glm(TARGET_AMT ~ KIDSDRIV + AGE + YOJ + INCOME + HOME_VAL + TRAVTIME + BLUEBOOK +
            TIF + OLDCLAIM + MVR_PTS + CAR_AGE, data = df_ins, family = "quasipoisson")

vif(model)
##              GVIF Df GVIF^(1/(2*Df))
## KIDSDRIV 1.024923  4        1.003082
## AGE      1.094995  1        1.046420
## YOJ      1.182086  1        1.087238
## INCOME   1.821517  1        1.349636
## HOME_VAL 1.330625  1        1.153527
## TRAVTIME 1.004598  1        1.002296
## BLUEBOOK 1.229381  1        1.108774
## TIF      1.004250  1        1.002123
## OLDCLAIM 1.080919  1        1.039673
## MVR_PTS  1.097635  1        1.047681
## CAR_AGE  1.252010  1        1.118933

Bucket select variables (by quantiles)

# 0-.25, .25-.75, .75-1
df_ins$CAR_AGE_fact <- cut(x = df_ins$CAR_AGE, breaks = c(-4, 3.5, 12, 28), labels = c("New", "Moderate", "Old"))

# -.5, .5-.9, .9+
df_ins$HOME_VAL_fact <- cut(x = df_ins$HOME_VAL, breaks = c(-86567, 160953, 314151, 885283), labels = c("No or Low", "Moderate", "High"))

# 0-.25, .25-.75, .75-1
df_ins$INCOME_fact <- cut(x = df_ins$INCOME, breaks = c(-31969, 27940, 85472, 367031), labels = c("Low", "Moderate", "High"))

# 0-.5, .50-.75, .75-1
df_ins$MVR_PTS_fact <- cut(x = df_ins$MVR_PTS, breaks = c(-1, 1, 3, 14), labels = c("Low", "Moderate", "High"))

# 0-.75, .75-1
df_ins$OLDCLAIM_fact <- cut(x = df_ins$OLDCLAIM, breaks = c(-1, 4636, 57038), labels = c("Low", "High"))

# 0-.25, .25-.75, .75-1
df_ins$TIF_fact <- cut(x = df_ins$TIF, breaks = c(-1, 1, 7, 26), labels = c("Low", "Moderate", "High"))

# 0-.25, .25-.75, .75-1
df_ins$TRAVTIME_fact <- cut(x = df_ins$TRAVTIME, breaks = c(4, 22, 44, 143), labels = c("Short", "Moderate", "Long"))

# 0-.25, .25-.75, .75-1
df_ins$YOJ_fact <- cut(x = df_ins$YOJ, breaks = c(-1, 9, 13, 24), labels = c("Low", "Moderate", "High"))

Binary Logistic Regression

Binomial NULL Model

# remove set up the data
data_train <- df_ins[-c(2)]
model1 <- glm(TARGET_FLAG ~ 1, data = data_train, family = binomial(link ="logit"))
summary(model1)
## 
## Call:
## glm(formula = TARGET_FLAG ~ 1, family = binomial(link = "logit"), 
##     data = data_train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.7827  -0.7827  -0.7827   1.6325   1.6325  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.02623    0.02512  -40.86   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 9418  on 8160  degrees of freedom
## Residual deviance: 9418  on 8160  degrees of freedom
## AIC: 9420
## 
## Number of Fisher Scoring iterations: 4

Binomial FULL Model

model2 <- glm(TARGET_FLAG ~ ., data = data_train, family = binomial(link ="logit"))
summary(model2)
## 
## Call:
## glm(formula = TARGET_FLAG ~ ., family = binomial(link = "logit"), 
##     data = data_train)
## 
## Deviance Residuals: 
##        Min          1Q      Median          3Q         Max  
## -7.097e-06  -2.511e-06  -2.057e-06   2.100e-08   5.914e-05  
## 
## Coefficients:
##                                 Estimate Std. Error z value Pr(>|z|)
## (Intercept)                   -3.345e+03  2.612e+08   0.000    1.000
## KIDSDRIV1                     -1.735e-03  1.775e+04   0.000    1.000
## KIDSDRIV2                      1.838e-03  2.415e+04   0.000    1.000
## KIDSDRIV3                      3.525e-01  4.134e+04   0.000    1.000
## KIDSDRIV4                      6.280e-01  2.109e+05   0.000    1.000
## AGE                            1.580e+01  1.331e+06   0.000    1.000
## HOMEKIDS1                     -2.659e-01  1.695e+04   0.000    1.000
## HOMEKIDS2                     -2.706e-01  1.671e+04   0.000    1.000
## HOMEKIDS3                     -2.431e-01  1.965e+04   0.000    1.000
## HOMEKIDS4                     -2.266e-01  3.240e+04   0.000    1.000
## HOMEKIDS5                      3.096e-01  8.536e+04   0.000    1.000
## YOJ                           -9.961e-01  1.213e+05   0.000    1.000
## INCOME                        -2.680e-06  5.815e-01   0.000    1.000
## PARENT1Yes                     8.250e-02  1.769e+04   0.000    1.000
## HOME_VAL                       1.839e-05  1.682e+00   0.000    1.000
## MSTATUSYes                     2.509e-01  1.249e+04   0.000    1.000
## SEXM                           2.585e-01  1.448e+04   0.000    1.000
## EDUCATIONBachelors            -2.021e-01  1.755e+04   0.000    1.000
## EDUCATIONHigh School           1.813e-01  1.383e+04   0.000    1.000
## EDUCATIONMasters              -9.897e-02  2.503e+04   0.000    1.000
## EDUCATIONPhD                  -9.438e-02  2.969e+04   0.000    1.000
## JOBBlue Collar                 3.575e-01  2.507e+04   0.000    1.000
## JOBClerical                    1.292e-01  2.699e+04   0.000    1.000
## JOBDoctor                      8.911e-02  3.249e+04   0.000    1.000
## JOBHome Maker                  2.659e-01  3.086e+04   0.000    1.000
## JOBLawyer                      1.140e-01  2.315e+04   0.000    1.000
## JOBManager                     1.058e-01  2.243e+04   0.000    1.000
## JOBProfessional                1.443e-01  2.435e+04   0.000    1.000
## JOBStudent                     2.491e-01  3.278e+04   0.000    1.000
## TRAVTIME                       7.137e-01  1.220e+05   0.000    1.000
## CAR_USEPrivate                 1.721e-02  1.262e+04   0.000    1.000
## BLUEBOOK                      -1.516e-04  5.639e+01   0.000    1.000
## TIF                            2.338e-01  1.588e+05   0.000    1.000
## CAR_TYPEPanel Truck            7.391e-02  2.282e+04   0.000    1.000
## CAR_TYPEPickup                 2.934e-01  1.304e+04   0.000    1.000
## CAR_TYPESports Car            -4.077e-02  1.802e+04   0.000    1.000
## CAR_TYPESUV                   -4.845e-02  1.490e+04   0.000    1.000
## CAR_TYPEVan                    7.548e-03  1.730e+04   0.000    1.000
## RED_CARyes                    -3.638e-01  1.137e+04   0.000    1.000
## OLDCLAIM                      -8.819e-04  4.414e+01   0.000    1.000
## CLM_FREQ.L                    -3.199e+01  1.185e+06   0.000    1.000
## CLM_FREQ.Q                     2.932e+01  1.083e+06   0.000    1.000
## CLM_FREQ.C                    -2.007e+01  7.391e+05   0.000    1.000
## CLM_FREQ^4                     1.052e+01  3.747e+05   0.000    1.000
## CLM_FREQ^5                    -2.941e+00  1.259e+05   0.000    1.000
## REVOKEDYes                     4.190e-02  1.429e+04   0.000    1.000
## MVR_PTS                        9.022e-02  4.037e+05   0.000    1.000
## CAR_AGE                       -8.269e-03  2.020e+04   0.000    1.000
## URBANICITYHighly Urban/ Urban  5.374e-02  1.152e+04   0.000    1.000
## TARGET_AMT_log                 2.716e+01  3.854e+04   0.001    0.999
## AGE_log                       -1.586e+03  1.485e+08   0.000    1.000
## YOJ_log                       -7.349e+00  1.126e+06   0.000    1.000
## INCOME_log                    -1.404e-01  1.161e+04   0.000    1.000
## HOME_VAL_log                  -7.806e-01  7.806e+04   0.000    1.000
## TRAVTIME_log                  -7.413e+01  1.297e+07   0.000    1.000
## BLUEBOOK_log                   6.547e+00  1.163e+06   0.000    1.000
## TIF_log                       -1.365e+02  8.929e+06   0.000    1.000
## OLDCLAIM_log                   1.928e+01  7.313e+05   0.000    1.000
## MVR_PTS_log                    1.010e+00  2.415e+06   0.000    1.000
## CAR_AGE_log                    8.634e-01  1.316e+05   0.000    1.000
## TARGET_AMT_sqrt                2.433e+00  6.768e+03   0.000    1.000
## AGE_sqrt                      -1.714e+03  1.527e+08   0.000    1.000
## YOJ_sqrt                       1.420e+01  1.802e+06   0.000    1.000
## INCOME_sqrt                    4.405e-05  2.644e+02   0.000    1.000
## HOME_VAL_sqrt                 -7.347e-02  6.282e+03   0.000    1.000
## TRAVTIME_sqrt                 -7.932e+01  1.377e+07   0.000    1.000
## BLUEBOOK_sqrt                  3.858e-01  9.554e+04   0.000    1.000
## TIF_sqrt                      -1.129e+02  5.309e+06   0.000    1.000
## OLDCLAIM_sqrt                  1.650e+00  6.927e+04   0.000    1.000
## MVR_PTS_sqrt                  -1.493e+00  5.256e+06   0.000    1.000
## CAR_AGE_sqrt                   1.435e-01  3.475e+05   0.000    1.000
## TARGET_AMT_cbrt               -2.048e+01  4.717e+04   0.000    1.000
## AGE_cbrt                       5.556e+03  5.039e+08   0.000    1.000
## YOJ_cbrt                      -7.775e+00  8.697e+05   0.000    1.000
## INCOME_cbrt                    4.954e-02  4.126e+03   0.000    1.000
## HOME_VAL_cbrt                  6.556e-01  5.847e+04   0.000    1.000
## TRAVTIME_cbrt                  2.584e+02  4.501e+07   0.000    1.000
## BLUEBOOK_cbrt                 -3.421e+00  7.494e+05   0.000    1.000
## TIF_cbrt                       4.177e+02  2.342e+07   0.000    1.000
## OLDCLAIM_cbrt                 -1.305e+01  5.231e+05   0.000    1.000
## MVR_PTS_cbrt                   5.135e-01  3.185e+06   0.000    1.000
## CAR_AGE_cbrt                  -1.028e+00  4.522e+05   0.000    1.000
## HV_INC_RATIO                   3.968e-05  7.142e+01   0.000    1.000
## TRT_MVR_PRODUCT               -1.001e-04  1.161e+02   0.000    1.000
## CAR_AGE_factModerate          -1.707e-01  3.880e+04   0.000    1.000
## CAR_AGE_factOld               -2.867e-01  4.299e+04   0.000    1.000
## HOME_VAL_factModerate          4.997e-01  1.898e+04   0.000    1.000
## HOME_VAL_factHigh              7.148e-01  3.213e+04   0.000    1.000
## INCOME_factModerate           -2.264e-01  1.950e+04   0.000    1.000
## INCOME_factHigh               -3.339e-01  2.860e+04   0.000    1.000
## MVR_PTS_factModerate          -9.545e-02  5.133e+04   0.000    1.000
## MVR_PTS_factHigh              -9.431e-02  6.485e+04   0.000    1.000
## OLDCLAIM_factHigh              1.318e+00  2.702e+04   0.000    1.000
## TIF_factModerate              -7.702e+00  4.320e+05   0.000    1.000
## TIF_factHigh                  -7.410e+00  4.342e+05   0.000    1.000
## TRAVTIME_factModerate          1.004e-01  2.075e+04   0.000    1.000
## TRAVTIME_factLong             -1.945e-02  2.927e+04   0.000    1.000
## YOJ_factModerate              -4.566e-01  1.960e+04   0.000    1.000
## YOJ_factHigh                  -3.441e-01  3.000e+04   0.000    1.000
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 9.4180e+03  on 8160  degrees of freedom
## Residual deviance: 5.1235e-08  on 8062  degrees of freedom
## AIC: 198
## 
## Number of Fisher Scoring iterations: 25
plot(model2)

Generate confidence intervals for regression slope

confint.default(model2)
##                                       2.5 %       97.5 %
## (Intercept)                   -5.118488e+08 5.118421e+08
## KIDSDRIV1                     -3.478957e+04 3.478957e+04
## KIDSDRIV2                     -4.733155e+04 4.733156e+04
## KIDSDRIV3                     -8.102270e+04 8.102340e+04
## KIDSDRIV4                     -4.132803e+05 4.132816e+05
## AGE                           -2.608372e+06 2.608404e+06
## HOMEKIDS1                     -3.321357e+04 3.321303e+04
## HOMEKIDS2                     -3.276100e+04 3.276046e+04
## HOMEKIDS3                     -3.851237e+04 3.851188e+04
## HOMEKIDS4                     -6.349851e+04 6.349805e+04
## HOMEKIDS5                     -1.672962e+05 1.672968e+05
## YOJ                           -2.376768e+05 2.376748e+05
## INCOME                        -1.139812e+00 1.139806e+00
## PARENT1Yes                    -3.467866e+04 3.467883e+04
## HOME_VAL                      -3.296746e+00 3.296783e+00
## MSTATUSYes                    -2.447699e+04 2.447750e+04
## SEXM                          -2.838324e+04 2.838375e+04
## EDUCATIONBachelors            -3.439030e+04 3.438989e+04
## EDUCATIONHigh School          -2.710557e+04 2.710593e+04
## EDUCATIONMasters              -4.906690e+04 4.906670e+04
## EDUCATIONPhD                  -5.819732e+04 5.819714e+04
## JOBBlue Collar                -4.914178e+04 4.914249e+04
## JOBClerical                   -5.290370e+04 5.290395e+04
## JOBDoctor                     -6.368393e+04 6.368411e+04
## JOBHome Maker                 -6.049254e+04 6.049307e+04
## JOBLawyer                     -4.537197e+04 4.537220e+04
## JOBManager                    -4.396173e+04 4.396195e+04
## JOBProfessional               -4.772949e+04 4.772978e+04
## JOBStudent                    -6.425414e+04 6.425464e+04
## TRAVTIME                      -2.391640e+05 2.391654e+05
## CAR_USEPrivate                -2.473323e+04 2.473327e+04
## BLUEBOOK                      -1.105181e+02 1.105178e+02
## TIF                           -3.111941e+05 3.111946e+05
## CAR_TYPEPanel Truck           -4.472050e+04 4.472065e+04
## CAR_TYPEPickup                -2.554847e+04 2.554906e+04
## CAR_TYPESports Car            -3.532592e+04 3.532583e+04
## CAR_TYPESUV                   -2.919573e+04 2.919563e+04
## CAR_TYPEVan                   -3.389959e+04 3.389961e+04
## RED_CARyes                    -2.228563e+04 2.228490e+04
## OLDCLAIM                      -8.651269e+01 8.651092e+01
## CLM_FREQ.L                    -2.321779e+06 2.321715e+06
## CLM_FREQ.Q                    -2.121782e+06 2.121841e+06
## CLM_FREQ.C                    -1.448711e+06 1.448671e+06
## CLM_FREQ^4                    -7.344076e+05 7.344286e+05
## CLM_FREQ^5                    -2.468403e+05 2.468344e+05
## REVOKEDYes                    -2.801319e+04 2.801328e+04
## MVR_PTS                       -7.912663e+05 7.912665e+05
## CAR_AGE                       -3.958450e+04 3.958448e+04
## URBANICITYHighly Urban/ Urban -2.258383e+04 2.258394e+04
## TARGET_AMT_log                -7.551677e+04 7.557108e+04
## AGE_log                       -2.910554e+08 2.910522e+08
## YOJ_log                       -2.206132e+06 2.206117e+06
## INCOME_log                    -2.275175e+04 2.275147e+04
## HOME_VAL_log                  -1.529913e+05 1.529897e+05
## TRAVTIME_log                  -2.541928e+07 2.541913e+07
## BLUEBOOK_log                  -2.280009e+06 2.280022e+06
## TIF_log                       -1.749980e+07 1.749952e+07
## OLDCLAIM_log                  -1.433342e+06 1.433380e+06
## MVR_PTS_log                   -4.734282e+06 4.734284e+06
## CAR_AGE_log                   -2.578416e+05 2.578433e+05
## TARGET_AMT_sqrt               -1.326338e+04 1.326825e+04
## AGE_sqrt                      -2.992105e+08 2.992071e+08
## YOJ_sqrt                      -3.531921e+06 3.531949e+06
## INCOME_sqrt                   -5.182162e+02 5.182163e+02
## HOME_VAL_sqrt                 -1.231321e+04 1.231306e+04
## TRAVTIME_sqrt                 -2.699100e+07 2.699084e+07
## BLUEBOOK_sqrt                 -1.872507e+05 1.872515e+05
## TIF_sqrt                      -1.040508e+07 1.040485e+07
## OLDCLAIM_sqrt                 -1.357719e+05 1.357752e+05
## MVR_PTS_sqrt                  -1.030107e+07 1.030106e+07
## CAR_AGE_sqrt                  -6.810229e+05 6.810232e+05
## TARGET_AMT_cbrt               -9.246969e+04 9.242873e+04
## AGE_cbrt                      -9.875794e+08 9.875906e+08
## YOJ_cbrt                      -1.704594e+06 1.704579e+06
## INCOME_cbrt                   -8.087103e+03 8.087202e+03
## HOME_VAL_cbrt                 -1.146009e+05 1.146022e+05
## TRAVTIME_cbrt                 -8.821839e+07 8.821890e+07
## BLUEBOOK_cbrt                 -1.468814e+06 1.468807e+06
## TIF_cbrt                      -4.590341e+07 4.590424e+07
## OLDCLAIM_cbrt                 -1.025269e+06 1.025243e+06
## MVR_PTS_cbrt                  -6.242116e+06 6.242117e+06
## CAR_AGE_cbrt                  -8.863783e+05 8.863763e+05
## HV_INC_RATIO                  -1.399792e+02 1.399792e+02
## TRT_MVR_PRODUCT               -2.275928e+02 2.275926e+02
## CAR_AGE_factModerate          -7.604626e+04 7.604592e+04
## CAR_AGE_factOld               -8.426315e+04 8.426257e+04
## HOME_VAL_factModerate         -3.719063e+04 3.719163e+04
## HOME_VAL_factHigh             -6.296905e+04 6.297048e+04
## INCOME_factModerate           -3.821705e+04 3.821659e+04
## INCOME_factHigh               -5.606057e+04 5.605990e+04
## MVR_PTS_factModerate          -1.006057e+05 1.006055e+05
## MVR_PTS_factHigh              -1.270942e+05 1.270940e+05
## OLDCLAIM_factHigh             -5.294944e+04 5.295208e+04
## TIF_factModerate              -8.466392e+05 8.466238e+05
## TIF_factHigh                  -8.510064e+05 8.509916e+05
## TRAVTIME_factModerate         -4.067301e+04 4.067321e+04
## TRAVTIME_factLong             -5.736656e+04 5.736652e+04
## YOJ_factModerate              -3.841196e+04 3.841105e+04
## YOJ_factHigh                  -5.879473e+04 5.879405e+04

Generate the odds ratios

exp(coef(model2))
##                   (Intercept)                     KIDSDRIV1 
##                  0.000000e+00                  9.982670e-01 
##                     KIDSDRIV2                     KIDSDRIV3 
##                  1.001839e+00                  1.422628e+00 
##                     KIDSDRIV4                           AGE 
##                  1.873823e+00                  7.310367e+06 
##                     HOMEKIDS1                     HOMEKIDS2 
##                  7.665136e-01                  7.629366e-01 
##                     HOMEKIDS3                     HOMEKIDS4 
##                  7.842160e-01                  7.972063e-01 
##                     HOMEKIDS5                           YOJ 
##                  1.362944e+00                  3.693247e-01 
##                        INCOME                    PARENT1Yes 
##                  9.999973e-01                  1.085997e+00 
##                      HOME_VAL                    MSTATUSYes 
##                  1.000018e+00                  1.285141e+00 
##                          SEXM            EDUCATIONBachelors 
##                  1.294976e+00                  8.169862e-01 
##          EDUCATIONHigh School              EDUCATIONMasters 
##                  1.198809e+00                  9.057728e-01 
##                  EDUCATIONPhD                JOBBlue Collar 
##                  9.099414e-01                  1.429742e+00 
##                   JOBClerical                     JOBDoctor 
##                  1.137944e+00                  1.093205e+00 
##                 JOBHome Maker                     JOBLawyer 
##                  1.304605e+00                  1.120785e+00 
##                    JOBManager               JOBProfessional 
##                  1.111590e+00                  1.155212e+00 
##                    JOBStudent                      TRAVTIME 
##                  1.282826e+00                  2.041484e+00 
##                CAR_USEPrivate                      BLUEBOOK 
##                  1.017361e+00                  9.998484e-01 
##                           TIF           CAR_TYPEPanel Truck 
##                  1.263384e+00                  1.076708e+00 
##                CAR_TYPEPickup            CAR_TYPESports Car 
##                  1.341014e+00                  9.600537e-01 
##                   CAR_TYPESUV                   CAR_TYPEVan 
##                  9.527057e-01                  1.007577e+00 
##                    RED_CARyes                      OLDCLAIM 
##                  6.950107e-01                  9.991185e-01 
##                    CLM_FREQ.L                    CLM_FREQ.Q 
##                  1.282684e-14                  5.428250e+12 
##                    CLM_FREQ.C                    CLM_FREQ^4 
##                  1.922168e-09                  3.694842e+04 
##                    CLM_FREQ^5                    REVOKEDYes 
##                  5.281858e-02                  1.042795e+00 
##                       MVR_PTS                       CAR_AGE 
##                  1.094420e+00                  9.917648e-01 
## URBANICITYHighly Urban/ Urban                TARGET_AMT_log 
##                  1.055209e+00                  6.238277e+11 
##                       AGE_log                       YOJ_log 
##                  0.000000e+00                  6.435035e-04 
##                    INCOME_log                  HOME_VAL_log 
##                  8.690197e-01                  4.581413e-01 
##                  TRAVTIME_log                  BLUEBOOK_log 
##                  6.391237e-33                  6.974428e+02 
##                       TIF_log                  OLDCLAIM_log 
##                  5.272341e-60                  2.369477e+08 
##                   MVR_PTS_log                   CAR_AGE_log 
##                  2.746150e+00                  2.371163e+00 
##               TARGET_AMT_sqrt                      AGE_sqrt 
##                  1.139636e+01                  0.000000e+00 
##                      YOJ_sqrt                   INCOME_sqrt 
##                  1.465582e+06                  1.000044e+00 
##                 HOME_VAL_sqrt                 TRAVTIME_sqrt 
##                  9.291674e-01                  3.570723e-35 
##                 BLUEBOOK_sqrt                      TIF_sqrt 
##                  1.470762e+00                  9.062180e-50 
##                 OLDCLAIM_sqrt                  MVR_PTS_sqrt 
##                  5.205367e+00                  2.246518e-01 
##                  CAR_AGE_sqrt               TARGET_AMT_cbrt 
##                  1.154281e+00                  1.271050e-09 
##                      AGE_cbrt                      YOJ_cbrt 
##                           Inf                  4.202583e-04 
##                   INCOME_cbrt                 HOME_VAL_cbrt 
##                  1.050791e+00                  1.926318e+00 
##                 TRAVTIME_cbrt                 BLUEBOOK_cbrt 
##                 1.623924e+112                  3.267791e-02 
##                      TIF_cbrt                 OLDCLAIM_cbrt 
##                 2.493477e+181                  2.139798e-06 
##                  MVR_PTS_cbrt                  CAR_AGE_cbrt 
##                  1.671168e+00                  3.575736e-01 
##                  HV_INC_RATIO               TRT_MVR_PRODUCT 
##                  1.000040e+00                  9.998999e-01 
##          CAR_AGE_factModerate               CAR_AGE_factOld 
##                  8.431078e-01                  7.507079e-01 
##         HOME_VAL_factModerate             HOME_VAL_factHigh 
##                  1.648253e+00                  2.043822e+00 
##           INCOME_factModerate               INCOME_factHigh 
##                  7.974285e-01                  7.160932e-01 
##          MVR_PTS_factModerate              MVR_PTS_factHigh 
##                  9.089606e-01                  9.100028e-01 
##             OLDCLAIM_factHigh              TIF_factModerate 
##                  3.736112e+00                  4.520045e-04 
##                  TIF_factHigh         TRAVTIME_factModerate 
##                  6.048732e-04                  1.105615e+00 
##             TRAVTIME_factLong              YOJ_factModerate 
##                  9.807391e-01                  6.334102e-01 
##                  YOJ_factHigh 
##                  7.088489e-01

Generate confidence intervals for regression slope

confint.default(model2)
##                                       2.5 %       97.5 %
## (Intercept)                   -5.118488e+08 5.118421e+08
## KIDSDRIV1                     -3.478957e+04 3.478957e+04
## KIDSDRIV2                     -4.733155e+04 4.733156e+04
## KIDSDRIV3                     -8.102270e+04 8.102340e+04
## KIDSDRIV4                     -4.132803e+05 4.132816e+05
## AGE                           -2.608372e+06 2.608404e+06
## HOMEKIDS1                     -3.321357e+04 3.321303e+04
## HOMEKIDS2                     -3.276100e+04 3.276046e+04
## HOMEKIDS3                     -3.851237e+04 3.851188e+04
## HOMEKIDS4                     -6.349851e+04 6.349805e+04
## HOMEKIDS5                     -1.672962e+05 1.672968e+05
## YOJ                           -2.376768e+05 2.376748e+05
## INCOME                        -1.139812e+00 1.139806e+00
## PARENT1Yes                    -3.467866e+04 3.467883e+04
## HOME_VAL                      -3.296746e+00 3.296783e+00
## MSTATUSYes                    -2.447699e+04 2.447750e+04
## SEXM                          -2.838324e+04 2.838375e+04
## EDUCATIONBachelors            -3.439030e+04 3.438989e+04
## EDUCATIONHigh School          -2.710557e+04 2.710593e+04
## EDUCATIONMasters              -4.906690e+04 4.906670e+04
## EDUCATIONPhD                  -5.819732e+04 5.819714e+04
## JOBBlue Collar                -4.914178e+04 4.914249e+04
## JOBClerical                   -5.290370e+04 5.290395e+04
## JOBDoctor                     -6.368393e+04 6.368411e+04
## JOBHome Maker                 -6.049254e+04 6.049307e+04
## JOBLawyer                     -4.537197e+04 4.537220e+04
## JOBManager                    -4.396173e+04 4.396195e+04
## JOBProfessional               -4.772949e+04 4.772978e+04
## JOBStudent                    -6.425414e+04 6.425464e+04
## TRAVTIME                      -2.391640e+05 2.391654e+05
## CAR_USEPrivate                -2.473323e+04 2.473327e+04
## BLUEBOOK                      -1.105181e+02 1.105178e+02
## TIF                           -3.111941e+05 3.111946e+05
## CAR_TYPEPanel Truck           -4.472050e+04 4.472065e+04
## CAR_TYPEPickup                -2.554847e+04 2.554906e+04
## CAR_TYPESports Car            -3.532592e+04 3.532583e+04
## CAR_TYPESUV                   -2.919573e+04 2.919563e+04
## CAR_TYPEVan                   -3.389959e+04 3.389961e+04
## RED_CARyes                    -2.228563e+04 2.228490e+04
## OLDCLAIM                      -8.651269e+01 8.651092e+01
## CLM_FREQ.L                    -2.321779e+06 2.321715e+06
## CLM_FREQ.Q                    -2.121782e+06 2.121841e+06
## CLM_FREQ.C                    -1.448711e+06 1.448671e+06
## CLM_FREQ^4                    -7.344076e+05 7.344286e+05
## CLM_FREQ^5                    -2.468403e+05 2.468344e+05
## REVOKEDYes                    -2.801319e+04 2.801328e+04
## MVR_PTS                       -7.912663e+05 7.912665e+05
## CAR_AGE                       -3.958450e+04 3.958448e+04
## URBANICITYHighly Urban/ Urban -2.258383e+04 2.258394e+04
## TARGET_AMT_log                -7.551677e+04 7.557108e+04
## AGE_log                       -2.910554e+08 2.910522e+08
## YOJ_log                       -2.206132e+06 2.206117e+06
## INCOME_log                    -2.275175e+04 2.275147e+04
## HOME_VAL_log                  -1.529913e+05 1.529897e+05
## TRAVTIME_log                  -2.541928e+07 2.541913e+07
## BLUEBOOK_log                  -2.280009e+06 2.280022e+06
## TIF_log                       -1.749980e+07 1.749952e+07
## OLDCLAIM_log                  -1.433342e+06 1.433380e+06
## MVR_PTS_log                   -4.734282e+06 4.734284e+06
## CAR_AGE_log                   -2.578416e+05 2.578433e+05
## TARGET_AMT_sqrt               -1.326338e+04 1.326825e+04
## AGE_sqrt                      -2.992105e+08 2.992071e+08
## YOJ_sqrt                      -3.531921e+06 3.531949e+06
## INCOME_sqrt                   -5.182162e+02 5.182163e+02
## HOME_VAL_sqrt                 -1.231321e+04 1.231306e+04
## TRAVTIME_sqrt                 -2.699100e+07 2.699084e+07
## BLUEBOOK_sqrt                 -1.872507e+05 1.872515e+05
## TIF_sqrt                      -1.040508e+07 1.040485e+07
## OLDCLAIM_sqrt                 -1.357719e+05 1.357752e+05
## MVR_PTS_sqrt                  -1.030107e+07 1.030106e+07
## CAR_AGE_sqrt                  -6.810229e+05 6.810232e+05
## TARGET_AMT_cbrt               -9.246969e+04 9.242873e+04
## AGE_cbrt                      -9.875794e+08 9.875906e+08
## YOJ_cbrt                      -1.704594e+06 1.704579e+06
## INCOME_cbrt                   -8.087103e+03 8.087202e+03
## HOME_VAL_cbrt                 -1.146009e+05 1.146022e+05
## TRAVTIME_cbrt                 -8.821839e+07 8.821890e+07
## BLUEBOOK_cbrt                 -1.468814e+06 1.468807e+06
## TIF_cbrt                      -4.590341e+07 4.590424e+07
## OLDCLAIM_cbrt                 -1.025269e+06 1.025243e+06
## MVR_PTS_cbrt                  -6.242116e+06 6.242117e+06
## CAR_AGE_cbrt                  -8.863783e+05 8.863763e+05
## HV_INC_RATIO                  -1.399792e+02 1.399792e+02
## TRT_MVR_PRODUCT               -2.275928e+02 2.275926e+02
## CAR_AGE_factModerate          -7.604626e+04 7.604592e+04
## CAR_AGE_factOld               -8.426315e+04 8.426257e+04
## HOME_VAL_factModerate         -3.719063e+04 3.719163e+04
## HOME_VAL_factHigh             -6.296905e+04 6.297048e+04
## INCOME_factModerate           -3.821705e+04 3.821659e+04
## INCOME_factHigh               -5.606057e+04 5.605990e+04
## MVR_PTS_factModerate          -1.006057e+05 1.006055e+05
## MVR_PTS_factHigh              -1.270942e+05 1.270940e+05
## OLDCLAIM_factHigh             -5.294944e+04 5.295208e+04
## TIF_factModerate              -8.466392e+05 8.466238e+05
## TIF_factHigh                  -8.510064e+05 8.509916e+05
## TRAVTIME_factModerate         -4.067301e+04 4.067321e+04
## TRAVTIME_factLong             -5.736656e+04 5.736652e+04
## YOJ_factModerate              -3.841196e+04 3.841105e+04
## YOJ_factHigh                  -5.879473e+04 5.879405e+04

Generate the odds ratios

exp(coef(model2))
##                   (Intercept)                     KIDSDRIV1 
##                  0.000000e+00                  9.982670e-01 
##                     KIDSDRIV2                     KIDSDRIV3 
##                  1.001839e+00                  1.422628e+00 
##                     KIDSDRIV4                           AGE 
##                  1.873823e+00                  7.310367e+06 
##                     HOMEKIDS1                     HOMEKIDS2 
##                  7.665136e-01                  7.629366e-01 
##                     HOMEKIDS3                     HOMEKIDS4 
##                  7.842160e-01                  7.972063e-01 
##                     HOMEKIDS5                           YOJ 
##                  1.362944e+00                  3.693247e-01 
##                        INCOME                    PARENT1Yes 
##                  9.999973e-01                  1.085997e+00 
##                      HOME_VAL                    MSTATUSYes 
##                  1.000018e+00                  1.285141e+00 
##                          SEXM            EDUCATIONBachelors 
##                  1.294976e+00                  8.169862e-01 
##          EDUCATIONHigh School              EDUCATIONMasters 
##                  1.198809e+00                  9.057728e-01 
##                  EDUCATIONPhD                JOBBlue Collar 
##                  9.099414e-01                  1.429742e+00 
##                   JOBClerical                     JOBDoctor 
##                  1.137944e+00                  1.093205e+00 
##                 JOBHome Maker                     JOBLawyer 
##                  1.304605e+00                  1.120785e+00 
##                    JOBManager               JOBProfessional 
##                  1.111590e+00                  1.155212e+00 
##                    JOBStudent                      TRAVTIME 
##                  1.282826e+00                  2.041484e+00 
##                CAR_USEPrivate                      BLUEBOOK 
##                  1.017361e+00                  9.998484e-01 
##                           TIF           CAR_TYPEPanel Truck 
##                  1.263384e+00                  1.076708e+00 
##                CAR_TYPEPickup            CAR_TYPESports Car 
##                  1.341014e+00                  9.600537e-01 
##                   CAR_TYPESUV                   CAR_TYPEVan 
##                  9.527057e-01                  1.007577e+00 
##                    RED_CARyes                      OLDCLAIM 
##                  6.950107e-01                  9.991185e-01 
##                    CLM_FREQ.L                    CLM_FREQ.Q 
##                  1.282684e-14                  5.428250e+12 
##                    CLM_FREQ.C                    CLM_FREQ^4 
##                  1.922168e-09                  3.694842e+04 
##                    CLM_FREQ^5                    REVOKEDYes 
##                  5.281858e-02                  1.042795e+00 
##                       MVR_PTS                       CAR_AGE 
##                  1.094420e+00                  9.917648e-01 
## URBANICITYHighly Urban/ Urban                TARGET_AMT_log 
##                  1.055209e+00                  6.238277e+11 
##                       AGE_log                       YOJ_log 
##                  0.000000e+00                  6.435035e-04 
##                    INCOME_log                  HOME_VAL_log 
##                  8.690197e-01                  4.581413e-01 
##                  TRAVTIME_log                  BLUEBOOK_log 
##                  6.391237e-33                  6.974428e+02 
##                       TIF_log                  OLDCLAIM_log 
##                  5.272341e-60                  2.369477e+08 
##                   MVR_PTS_log                   CAR_AGE_log 
##                  2.746150e+00                  2.371163e+00 
##               TARGET_AMT_sqrt                      AGE_sqrt 
##                  1.139636e+01                  0.000000e+00 
##                      YOJ_sqrt                   INCOME_sqrt 
##                  1.465582e+06                  1.000044e+00 
##                 HOME_VAL_sqrt                 TRAVTIME_sqrt 
##                  9.291674e-01                  3.570723e-35 
##                 BLUEBOOK_sqrt                      TIF_sqrt 
##                  1.470762e+00                  9.062180e-50 
##                 OLDCLAIM_sqrt                  MVR_PTS_sqrt 
##                  5.205367e+00                  2.246518e-01 
##                  CAR_AGE_sqrt               TARGET_AMT_cbrt 
##                  1.154281e+00                  1.271050e-09 
##                      AGE_cbrt                      YOJ_cbrt 
##                           Inf                  4.202583e-04 
##                   INCOME_cbrt                 HOME_VAL_cbrt 
##                  1.050791e+00                  1.926318e+00 
##                 TRAVTIME_cbrt                 BLUEBOOK_cbrt 
##                 1.623924e+112                  3.267791e-02 
##                      TIF_cbrt                 OLDCLAIM_cbrt 
##                 2.493477e+181                  2.139798e-06 
##                  MVR_PTS_cbrt                  CAR_AGE_cbrt 
##                  1.671168e+00                  3.575736e-01 
##                  HV_INC_RATIO               TRT_MVR_PRODUCT 
##                  1.000040e+00                  9.998999e-01 
##          CAR_AGE_factModerate               CAR_AGE_factOld 
##                  8.431078e-01                  7.507079e-01 
##         HOME_VAL_factModerate             HOME_VAL_factHigh 
##                  1.648253e+00                  2.043822e+00 
##           INCOME_factModerate               INCOME_factHigh 
##                  7.974285e-01                  7.160932e-01 
##          MVR_PTS_factModerate              MVR_PTS_factHigh 
##                  9.089606e-01                  9.100028e-01 
##             OLDCLAIM_factHigh              TIF_factModerate 
##                  3.736112e+00                  4.520045e-04 
##                  TIF_factHigh         TRAVTIME_factModerate 
##                  6.048732e-04                  1.105615e+00 
##             TRAVTIME_factLong              YOJ_factModerate 
##                  9.807391e-01                  6.334102e-01 
##                  YOJ_factHigh 
##                  7.088489e-01

I will select the variables using the stepwise method

The ‘stepAIC’ function in R performs a stepwise model selection with an objective to minimize the AIC value.

Using StepWise in both direction

stepWise_b <- stepAIC(model1, scope = list(upper=model2),direction="both")
## Start:  AIC=9419.96
## TARGET_FLAG ~ 1
## 
##                   Df Deviance    AIC
## + TARGET_AMT_log   1      0.0    4.0
## + TARGET_AMT_cbrt  1      0.0    4.0
## + TARGET_AMT_sqrt  1      0.0    4.0
## + URBANICITY       1   8915.9 8919.9
## + CLM_FREQ         5   8948.2 8960.2
## + OLDCLAIM_log     1   8959.2 8963.2
## + OLDCLAIM_cbrt    1   9035.1 9039.1
## + MVR_PTS          1   9049.6 9053.6
## + TRT_MVR_PRODUCT  1   9083.1 9087.1
## + MVR_PTS_sqrt     1   9092.5 9096.5
## + MVR_PTS_log      1   9093.3 9097.3
## + OLDCLAIM_sqrt    1   9111.6 9115.6
## + MVR_PTS_cbrt     1   9128.4 9132.4
## + HOME_VAL         1   9130.2 9134.2
## + MVR_PTS_fact     2   9133.3 9139.3
## + HOME_VAL_sqrt    1   9158.9 9162.9
## + JOB              8   9145.1 9163.1
## + HOME_VAL_cbrt    1   9184.1 9188.1
## + HOME_VAL_fact    2   9188.5 9194.5
## + OLDCLAIM_fact    1   9198.4 9202.4
## + PARENT1          1   9232.4 9236.4
## + HOME_VAL_log     1   9235.0 9239.0
## + INCOME_fact      2   9236.9 9242.9
## + INCOME           1   9240.0 9244.0
## + REVOKED          1   9245.9 9249.9
## + CAR_TYPE         5   9237.9 9249.9
## + INCOME_sqrt      1   9248.5 9252.5
## + EDUCATION        4   9246.5 9256.5
## + CAR_USE          1   9255.0 9259.0
## + MSTATUS          1   9270.8 9274.8
## + OLDCLAIM         1   9277.5 9281.5
## + HOMEKIDS         5   9283.6 9295.6
## + AGE_log          1   9306.7 9310.7
## + AGE_cbrt         1   9313.3 9317.3
## + BLUEBOOK_cbrt    1   9314.8 9318.8
## + BLUEBOOK_sqrt    1   9316.2 9320.2
## + AGE_sqrt         1   9317.1 9321.1
## + BLUEBOOK_log     1   9317.7 9321.7
## + CAR_AGE          1   9325.0 9329.0
## + BLUEBOOK         1   9327.5 9331.5
## + CAR_AGE_fact     2   9326.3 9332.3
## + AGE              1   9328.8 9332.8
## + CAR_AGE_sqrt     1   9330.3 9334.3
## + CAR_AGE_cbrt     1   9334.0 9338.0
## + CAR_AGE_log      1   9335.3 9339.3
## + KIDSDRIV         4   9331.9 9341.9
## + TIF_sqrt         1   9359.1 9363.1
## + TIF_cbrt         1   9359.7 9363.7
## + TIF_log          1   9360.3 9364.3
## + TIF              1   9360.8 9364.8
## + YOJ_cbrt         1   9364.1 9368.1
## + YOJ_log          1   9365.6 9369.6
## + YOJ_sqrt         1   9366.5 9370.5
## + TIF_fact         2   9368.2 9374.2
## + YOJ              1   9375.7 9379.7
## + TRAVTIME_log     1   9390.2 9394.2
## + TRAVTIME_cbrt    1   9392.1 9396.1
## + TRAVTIME_sqrt    1   9393.6 9397.6
## + TRAVTIME_fact    2   9392.8 9398.8
## + YOJ_fact         2   9394.0 9400.0
## + TRAVTIME         1   9399.0 9403.0
## + INCOME_cbrt      1   9412.0 9416.0
## + SEX              1   9414.3 9418.3
## <none>                 9418.0 9420.0
## + INCOME_log       1   9416.5 9420.5
## + HV_INC_RATIO     1   9417.5 9421.5
## + RED_CAR          1   9417.6 9421.6
## 
## Step:  AIC=4
## TARGET_FLAG ~ TARGET_AMT_log
## 
##                   Df Deviance  AIC
## <none>                      0    4
## + TARGET_AMT_cbrt  1        0    6
## + TARGET_AMT_sqrt  1        0    6
## + MVR_PTS          1        0    6
## + TRT_MVR_PRODUCT  1        0    6
## + MVR_PTS_log      1        0    6
## + MVR_PTS_sqrt     1        0    6
## + MVR_PTS_cbrt     1        0    6
## + PARENT1          1        0    6
## + REVOKED          1        0    6
## + BLUEBOOK         1        0    6
## + BLUEBOOK_sqrt    1        0    6
## + BLUEBOOK_cbrt    1        0    6
## + BLUEBOOK_log     1        0    6
## + MSTATUS          1        0    6
## + CAR_AGE          1        0    6
## + OLDCLAIM_log     1        0    6
## + CAR_AGE_sqrt     1        0    6
## + CAR_AGE_cbrt     1        0    6
## + CAR_AGE_log      1        0    6
## + INCOME           1        0    6
## + OLDCLAIM         1        0    6
## + INCOME_sqrt      1        0    6
## + OLDCLAIM_cbrt    1        0    6
## + OLDCLAIM_sqrt    1        0    6
## + SEX              1        0    6
## + RED_CAR          1        0    6
## + TIF_log          1        0    6
## + TIF_cbrt         1        0    6
## + YOJ_cbrt         1        0    6
## + YOJ_log          1        0    6
## + HOME_VAL_cbrt    1        0    6
## + TRAVTIME_sqrt    1        0    6
## + TRAVTIME         1        0    6
## + TRAVTIME_cbrt    1        0    6
## + INCOME_cbrt      1        0    6
## + HV_INC_RATIO     1        0    6
## + INCOME_log       1        0    6
## + TIF_sqrt         1        0    6
## + TRAVTIME_log     1        0    6
## + YOJ_sqrt         1        0    6
## + URBANICITY       1        0    6
## + TIF              1        0    6
## + HOME_VAL_sqrt    1        0    6
## + HOME_VAL         1        0    6
## + CAR_USE          1        0    6
## + OLDCLAIM_fact    1        0    6
## + AGE              1        0    6
## + YOJ              1        0    6
## + AGE_sqrt         1        0    6
## + AGE_cbrt         1        0    6
## + HOME_VAL_log     1        0    6
## + AGE_log          1        0    6
## + MVR_PTS_fact     2        0    8
## + TIF_fact         2        0    8
## + INCOME_fact      2        0    8
## + HOME_VAL_fact    2        0    8
## + CAR_AGE_fact     2        0    8
## + TRAVTIME_fact    2        0    8
## + YOJ_fact         2        0    8
## + EDUCATION        4        0   12
## + KIDSDRIV         4        0   12
## + CLM_FREQ         5        0   14
## + CAR_TYPE         5        0   14
## + HOMEKIDS         5        0   14
## + JOB              8        0   20
## - TARGET_AMT_log   1     9418 9420

Create histogram of the Probability

hist(stepWise_b$fitted.values, main = " Histogram ",xlab = "Probability", col = 'skyblue3')

Show the predicted values

df_ins$Predict <- ifelse(stepWise_b$fitted.values >0.5,"pos","neg")
head(df_ins$Predict)
## [1] "neg" "neg" "neg" "neg" "neg" "pos"