Data Exploration

Load the data

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors
library(psych)
## 
## Attaching package: 'psych'
## 
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
library(ggplot2)
library(dplyr)
library(stargazer)
## 
## Please cite as: 
## 
##  Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
insur_train <- read.csv("~/Desktop/insurance_training_data.csv", na.strings = c(""), stringsAsFactors = FALSE)
insur_eval <- read.csv("~/Desktop/insurance-evaluation-data.csv", na.strings = c(""), stringsAsFactors = FALSE)

Remove unnecessary variable

insur_train_clean <- insur_train[,-1] #index using base r to remove column
insur_eval_clean <- insur_eval[, -1]

Types of Variables

sapply(X=insur_train_clean,
       FUN= class)
## TARGET_FLAG  TARGET_AMT    KIDSDRIV         AGE    HOMEKIDS         YOJ 
##   "integer"   "numeric"   "integer"   "integer"   "integer"   "integer" 
##      INCOME     PARENT1    HOME_VAL     MSTATUS         SEX   EDUCATION 
## "character" "character" "character" "character" "character" "character" 
##         JOB    TRAVTIME     CAR_USE    BLUEBOOK         TIF    CAR_TYPE 
## "character"   "integer" "character" "character"   "integer" "character" 
##     RED_CAR    OLDCLAIM    CLM_FREQ     REVOKED     MVR_PTS     CAR_AGE 
## "character" "character"   "integer" "character"   "integer"   "integer" 
##  URBANICITY 
## "character"

Show where n.as are

insur_train_clean%>%
    summarise(across(everything(),~ sum(is.na(.)))) %>%
    glimpse()
## Rows: 1
## Columns: 25
## $ TARGET_FLAG <int> 0
## $ TARGET_AMT  <int> 0
## $ KIDSDRIV    <int> 0
## $ AGE         <int> 6
## $ HOMEKIDS    <int> 0
## $ YOJ         <int> 454
## $ INCOME      <int> 445
## $ PARENT1     <int> 0
## $ HOME_VAL    <int> 464
## $ MSTATUS     <int> 0
## $ SEX         <int> 0
## $ EDUCATION   <int> 0
## $ JOB         <int> 526
## $ TRAVTIME    <int> 0
## $ CAR_USE     <int> 0
## $ BLUEBOOK    <int> 0
## $ TIF         <int> 0
## $ CAR_TYPE    <int> 0
## $ RED_CAR     <int> 0
## $ OLDCLAIM    <int> 0
## $ CLM_FREQ    <int> 0
## $ REVOKED     <int> 0
## $ MVR_PTS     <int> 0
## $ CAR_AGE     <int> 510
## $ URBANICITY  <int> 0

Remove rows with n.as

insur_train_clean <- na.omit(insur_train_clean)

N/A variables tend to be in the same rows. So I am omitting all of those rows with N/As.

Correlation

Correlation of some variables I thought might have been highly correlated.

cor(insur_train_clean$TARGET_FLAG, insur_train_clean$AGE)
## [1] -0.1152745
cor(insur_train_clean$TARGET_FLAG, insur_train_clean$CAR_AGE)
## [1] -0.1102527
cor(insur_train_clean$TARGET_AMT, insur_train_clean$CAR_AGE)
## [1] -0.06961347
cor(insur_train_clean$TARGET_AMT, insur_train_clean$AGE)
## [1] -0.05654628

Plots of Qualitative Variables

table(insur_train_clean$CAR_USE, insur_train_clean$TARGET_FLAG)  
##             
##                 0    1
##   Commercial 1297  743
##   Private    3146  859
ggplot(data=insur_train_clean, aes(x=CAR_USE, y= TARGET_FLAG)) + 
      geom_bar(stat= "identity", width = 0.5, fill= "steelblue") + 
      geom_text(aes(label=TARGET_FLAG)) + 
      labs( x = "Car Use",
            y = "Number of Crashes", 
           title = "Number of Crashes based on Type of Car Use")

There is not much of a difference in the amount of crashes commercial vehicles get in to versus the amount private cars get in to. However, it does make sense that commercial cars tend to get into less crashes. This is because they are probably getting paid to drive as part of their job, the cars are probably not theirs, and they might be driving long distances as part of their job. So these people may tend to be extra careful when driving.

table(insur_train_clean$EDUCATION, insur_train_clean$TARGET_FLAG)  
##                
##                    0    1
##   <High School   642  313
##   Bachelors     1338  402
##   Masters        881  180
##   PhD            345   72
##   z_High School 1237  635
ggplot(data=insur_train_clean, aes(x=EDUCATION, y= TARGET_FLAG)) + 
      geom_bar(stat= "identity", width = 0.5, fill= "steelblue") + 
      geom_text(aes(label=TARGET_FLAG)) + 
      labs(x= "Level of Education", 
           y= "Number of Crashes", 
          title= "Number of Crashes Based Level of Education")

Those who only have high school education probably do not have the typical 9-5 corporate job. They may be out on the roads more as part of their job. Or, they may just not have a job and have more time on their hands to drive around. Some of these people could also be right out of high school, when teenagers tend to be reckless. It makes sense that people with a PhD get in the least amount of crashes because they are probably working or studying most of their days.

table(insur_train_clean$JOB, insur_train_clean$TARGET_FLAG)  
##                
##                   0   1
##   Clerical      719 312
##   Doctor        173  27
##   Home Maker    351 133
##   Lawyer        543 127
##   Manager       680  99
##   Professional  678 190
##   Student       337 200
##   z_Blue Collar 962 514
    ggplot(data=insur_train_clean, aes(x=JOB, y= TARGET_FLAG)) + 
      geom_bar(stat= "identity", width = 0.5, fill= "steelblue") + 
      geom_text(aes(label=TARGET_FLAG)) + 
      labs(x= "Occupation", 
          y= "Number of Crashes", 
          title= "Number of Crashes Based on Occupation")

Someone who is a blue collar worker may have to drive as part of their job, as where a doctor or manager does not. Blue collar workers might be driving longer distances and their job hours may be shorter. Whereas a doctor is probably in the hospital for many hours a day and does not drive that often.

Data Preparation

removes dollar and makes the variable numeric

library(stringr)
insur_train_clean <- insur_train_clean %>% 
  mutate(INCOME=as.numeric(str_replace_all(INCOME,'\\$|,',''))) %>%
  mutate(HOME_VAL=as.numeric(str_replace_all(HOME_VAL,'\\$|,',''))) %>%
  mutate(BLUEBOOK=as.numeric(str_replace_all(BLUEBOOK,'\\$|,',''))) %>%
  mutate(OLDCLAIM=as.numeric(str_replace_all(OLDCLAIM,'\\$|,','')))

insur_eval_clean <- insur_eval_clean %>% 
  mutate(INCOME=as.numeric(str_replace_all(INCOME,'\\$|,',''))) %>%
  mutate(HOME_VAL=as.numeric(str_replace_all(HOME_VAL,'\\$|,',''))) %>%
  mutate(BLUEBOOK=as.numeric(str_replace_all(BLUEBOOK,'\\$|,',''))) %>%
  mutate(OLDCLAIM=as.numeric(str_replace_all(OLDCLAIM,'\\$|,','')))

Dummy Variables -

Dummy variables are either 0 or 1. You use dummy variables when the answer is yes/no. The five variables below, that I made dummy variables, are those that had a yes/no answer. The benefit of using dummy variables is that it makes the variable numeric. These variables can now be shown in my summary stats.

library(fastDummies)
insur_train_clean.d <- dummy_cols(insur_train_clean, 
                                  select_columns = 
                                  c('PARENT1', 'MSTATUS',
                                    'RED_CAR', 'REVOKED', 'URBANICITY'), 
                                  remove_selected_columns = TRUE)

insur_eval_clean.d <- dummy_cols(insur_train_clean, 
                                  select_columns = 
                                  c('PARENT1', 'MSTATUS',
                                    'RED_CAR', 'REVOKED', 'URBANICITY'), 
                                  remove_selected_columns = TRUE)
labels <- c(
  'Crash', 
  'Cost (if crash occured)',
  'Number of Kids Driving',
  'Age',
  'Number of Kids at Home',
  'Years on Job',
  'Income',
  'Home Value',
  'Distance to Work',
  'Value of Vehicle',
  'Time in Force',
  'Total Old Claims',
  'Claims Filed', 
  'Motor Vehicle Record Points',
  'Vehicle Age',
  'Not a Single Parent',
  'Single Parent',
  'Married',
  'Not Married',
  'Not a Red Car',
  'Red Car',
  'License not Revoked',
  'License Revoked',
  'Urban Area',
  'Rural Area')

stargazer(insur_train_clean.d, 
          type = "text", # html, latex
          # out =
          # summary.stat = 
          covariate.labels = labels,
          digits = 2)
## 
## =======================================================================
## Statistic                     N      Mean     St. Dev.   Min     Max   
## -----------------------------------------------------------------------
## Crash                       6,045    0.27       0.44      0       1    
## Cost (if crash occured)     6,045  1,479.66   4,553.17  0.00  85,523.65
## Number of Kids Driving      6,045    0.17       0.52      0       4    
## Age                         6,045   44.63       8.71     16      81    
## Number of Kids at Home      6,045    0.74       1.13      0       5    
## Years on Job                6,045   10.49       4.14      0      23    
## Income                      6,045 58,177.01  43,826.98    0    367,030 
## Home Value                  6,045 150,102.10 123,728.70   0    885,282 
## Distance to Work            6,045   33.69      15.89      5      142   
## Value of Vehicle            6,045 15,235.61   8,040.96  1,500  65,970  
## Time in Force               6,045    5.36       4.14      1      25    
## Total Old Claims            6,045  4,004.88   8,822.51    0    57,037  
## Claims Filed                6,045    0.78       1.15      0       5    
## Motor Vehicle Record Points 6,045    1.70       2.16      0      13    
## Vehicle Age                 6,045    7.92       5.58     -3      28    
## Not a Single Parent         6,045    0.86       0.34      0       1    
## Single Parent               6,045    0.14       0.34      0       1    
## Married                     6,045    0.60       0.49      0       1    
## Not Married                 6,045    0.40       0.49      0       1    
## Not a Red Car               6,045    0.72       0.45      0       1    
## Red Car                     6,045    0.28       0.45      0       1    
## License not Revoked         6,045    0.88       0.33      0       1    
## License Revoked             6,045    0.12       0.33      0       1    
## Urban Area                  6,045    0.78       0.41      0       1    
## Rural Area                  6,045    0.22       0.41      0       1    
## -----------------------------------------------------------------------

I was interested to see that the mean age was around 45 years old. I wonder how the data would look if the mean age was older or younger. There would probably be more crashes if the mean age was younger (or even significantly older for that matter). The average amount of claims filed in the past five years for this population is .78, and the average motor vehicle record points is 1.70. This leads me to believe that this groups may be relatively safe drivers.

Multiple linear regression

Linear Model 1

c <- cor(insur_train_clean.d [, c(2,25:30)])
library(corrplot)
## corrplot 0.92 loaded
corrplot(c, type = "upper")

model1 <- lm(data = insur_train_clean.d,
             TARGET_AMT ~ . -RED_CAR_no -PARENT1_Yes -MSTATUS_z_No -REVOKED_Yes -`URBANICITY_z_Highly Rural/ Rural` -TARGET_FLAG)

library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:psych':
## 
##     logit
## The following object is masked from 'package:dplyr':
## 
##     recode
## The following object is masked from 'package:purrr':
## 
##     some
vif(model1)
##                                       GVIF Df GVIF^(1/(2*Df))
## KIDSDRIV                          1.313594  1        1.146121
## AGE                               1.492547  1        1.221699
## HOMEKIDS                          2.112021  1        1.453279
## YOJ                               1.467479  1        1.211395
## INCOME                            3.150506  1        1.774966
## HOME_VAL                          2.454136  1        1.566568
## SEX                               3.221519  1        1.794859
## EDUCATION                        11.263320  4        1.353500
## JOB                              22.073019  7        1.247355
## TRAVTIME                          1.036178  1        1.017928
## CAR_USE                           2.353409  1        1.534082
## BLUEBOOK                          1.872914  1        1.368544
## TIF                               1.008654  1        1.004318
## CAR_TYPE                          4.501908  5        1.162357
## OLDCLAIM                          1.704900  1        1.305718
## CLM_FREQ                          1.607648  1        1.267931
## MVR_PTS                           1.236336  1        1.111907
## CAR_AGE                           2.046281  1        1.430483
## PARENT1_No                        1.856170  1        1.362413
## MSTATUS_Yes                       2.109977  1        1.452576
## RED_CAR_yes                       1.850907  1        1.360480
## REVOKED_No                        1.295934  1        1.138391
## `URBANICITY_Highly Urban/ Urban`  1.251364  1        1.118644

I eliminated those specific variables because when I would do summary stats, they would come up as n.a. So I ran a correlation of my data and saw that these variables were coming up as n.a because they were highly co-linear to its counterpart.

Linear Model 2

model2 <- lm(data = insur_train_clean.d,
             TARGET_AMT ~ . -RED_CAR_no -PARENT1_Yes -MSTATUS_z_No -REVOKED_Yes -`URBANICITY_z_Highly Rural/ Rural` -EDUCATION -JOB -TARGET_FLAG)

Eliminated education and job because of multi-colinearity

library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
model3 <- stepAIC(object = lm(data = insur_train_clean.d, 
                            TARGET_AMT ~ . -TARGET_FLAG), 
                            direction = c("backward")
                            ) #used backward selection
## Start:  AIC=101432.9
## TARGET_AMT ~ (TARGET_FLAG + KIDSDRIV + AGE + HOMEKIDS + YOJ + 
##     INCOME + HOME_VAL + SEX + EDUCATION + JOB + TRAVTIME + CAR_USE + 
##     BLUEBOOK + TIF + CAR_TYPE + OLDCLAIM + CLM_FREQ + MVR_PTS + 
##     CAR_AGE + PARENT1_No + PARENT1_Yes + MSTATUS_Yes + MSTATUS_z_No + 
##     RED_CAR_no + RED_CAR_yes + REVOKED_No + REVOKED_Yes + `URBANICITY_Highly Urban/ Urban` + 
##     `URBANICITY_z_Highly Rural/ Rural`) - TARGET_FLAG
## 
## 
## Step:  AIC=101432.9
## TARGET_AMT ~ KIDSDRIV + AGE + HOMEKIDS + YOJ + INCOME + HOME_VAL + 
##     SEX + EDUCATION + JOB + TRAVTIME + CAR_USE + BLUEBOOK + TIF + 
##     CAR_TYPE + OLDCLAIM + CLM_FREQ + MVR_PTS + CAR_AGE + PARENT1_No + 
##     PARENT1_Yes + MSTATUS_Yes + MSTATUS_z_No + RED_CAR_no + RED_CAR_yes + 
##     REVOKED_No + REVOKED_Yes + `URBANICITY_Highly Urban/ Urban`
## 
## 
## Step:  AIC=101432.9
## TARGET_AMT ~ KIDSDRIV + AGE + HOMEKIDS + YOJ + INCOME + HOME_VAL + 
##     SEX + EDUCATION + JOB + TRAVTIME + CAR_USE + BLUEBOOK + TIF + 
##     CAR_TYPE + OLDCLAIM + CLM_FREQ + MVR_PTS + CAR_AGE + PARENT1_No + 
##     PARENT1_Yes + MSTATUS_Yes + MSTATUS_z_No + RED_CAR_no + RED_CAR_yes + 
##     REVOKED_No + `URBANICITY_Highly Urban/ Urban`
## 
## 
## Step:  AIC=101432.9
## TARGET_AMT ~ KIDSDRIV + AGE + HOMEKIDS + YOJ + INCOME + HOME_VAL + 
##     SEX + EDUCATION + JOB + TRAVTIME + CAR_USE + BLUEBOOK + TIF + 
##     CAR_TYPE + OLDCLAIM + CLM_FREQ + MVR_PTS + CAR_AGE + PARENT1_No + 
##     PARENT1_Yes + MSTATUS_Yes + MSTATUS_z_No + RED_CAR_no + REVOKED_No + 
##     `URBANICITY_Highly Urban/ Urban`
## 
## 
## Step:  AIC=101432.9
## TARGET_AMT ~ KIDSDRIV + AGE + HOMEKIDS + YOJ + INCOME + HOME_VAL + 
##     SEX + EDUCATION + JOB + TRAVTIME + CAR_USE + BLUEBOOK + TIF + 
##     CAR_TYPE + OLDCLAIM + CLM_FREQ + MVR_PTS + CAR_AGE + PARENT1_No + 
##     PARENT1_Yes + MSTATUS_Yes + RED_CAR_no + REVOKED_No + `URBANICITY_Highly Urban/ Urban`
## 
## 
## Step:  AIC=101432.9
## TARGET_AMT ~ KIDSDRIV + AGE + HOMEKIDS + YOJ + INCOME + HOME_VAL + 
##     SEX + EDUCATION + JOB + TRAVTIME + CAR_USE + BLUEBOOK + TIF + 
##     CAR_TYPE + OLDCLAIM + CLM_FREQ + MVR_PTS + CAR_AGE + PARENT1_No + 
##     MSTATUS_Yes + RED_CAR_no + REVOKED_No + `URBANICITY_Highly Urban/ Urban`
## 
##                                    Df  Sum of Sq        RSS    AIC
## - YOJ                               1     351556 1.1571e+11 101431
## - HOMEKIDS                          1     896426 1.1571e+11 101431
## - AGE                               1    5868923 1.1572e+11 101431
## - OLDCLAIM                          1    6544464 1.1572e+11 101431
## - RED_CAR_no                        1   18824464 1.1573e+11 101432
## - INCOME                            1   31562132 1.1575e+11 101433
## - CLM_FREQ                          1   31640073 1.1575e+11 101433
## - BLUEBOOK                          1   33101062 1.1575e+11 101433
## <none>                                           1.1571e+11 101433
## - HOME_VAL                          1   41502176 1.1576e+11 101433
## - EDUCATION                         4  161597962 1.1588e+11 101433
## - KIDSDRIV                          1   52170833 1.1577e+11 101434
## - CAR_AGE                           1   59033415 1.1577e+11 101434
## - PARENT1_No                        1   84127414 1.1580e+11 101435
## - SEX                               1  102980401 1.1582e+11 101436
## - REVOKED_No                        1  124659787 1.1584e+11 101437
## - TRAVTIME                          1  173886142 1.1589e+11 101440
## - MSTATUS_Yes                       1  202041858 1.1592e+11 101441
## - TIF                               1  204543749 1.1592e+11 101442
## - CAR_USE                           1  328063504 1.1604e+11 101448
## - JOB                               7  677813229 1.1639e+11 101454
## - CAR_TYPE                          5  701858743 1.1642e+11 101459
## - MVR_PTS                           1  679930613 1.1639e+11 101466
## - `URBANICITY_Highly Urban/ Urban`  1 2195481766 1.1791e+11 101545
## 
## Step:  AIC=101430.9
## TARGET_AMT ~ KIDSDRIV + AGE + HOMEKIDS + INCOME + HOME_VAL + 
##     SEX + EDUCATION + JOB + TRAVTIME + CAR_USE + BLUEBOOK + TIF + 
##     CAR_TYPE + OLDCLAIM + CLM_FREQ + MVR_PTS + CAR_AGE + PARENT1_No + 
##     MSTATUS_Yes + RED_CAR_no + REVOKED_No + `URBANICITY_Highly Urban/ Urban`
## 
##                                    Df  Sum of Sq        RSS    AIC
## - HOMEKIDS                          1     709162 1.1571e+11 101429
## - AGE                               1    6626898 1.1572e+11 101429
## - OLDCLAIM                          1    6628961 1.1572e+11 101429
## - RED_CAR_no                        1   18852982 1.1573e+11 101430
## - CLM_FREQ                          1   31758506 1.1575e+11 101431
## - INCOME                            1   32314466 1.1575e+11 101431
## - BLUEBOOK                          1   33080536 1.1575e+11 101431
## <none>                                           1.1571e+11 101431
## - HOME_VAL                          1   41487801 1.1576e+11 101431
## - EDUCATION                         4  161967684 1.1588e+11 101431
## - KIDSDRIV                          1   52695492 1.1577e+11 101432
## - CAR_AGE                           1   59011572 1.1577e+11 101432
## - PARENT1_No                        1   84333238 1.1580e+11 101433
## - SEX                               1  103113408 1.1582e+11 101434
## - REVOKED_No                        1  124836799 1.1584e+11 101435
## - TRAVTIME                          1  173726188 1.1589e+11 101438
## - TIF                               1  204732923 1.1592e+11 101440
## - MSTATUS_Yes                       1  205027759 1.1592e+11 101440
## - CAR_USE                           1  328574414 1.1604e+11 101446
## - JOB                               7  679641790 1.1639e+11 101452
## - CAR_TYPE                          5  703295192 1.1642e+11 101458
## - MVR_PTS                           1  680841708 1.1639e+11 101464
## - `URBANICITY_Highly Urban/ Urban`  1 2195131212 1.1791e+11 101543
## 
## Step:  AIC=101429
## TARGET_AMT ~ KIDSDRIV + AGE + INCOME + HOME_VAL + SEX + EDUCATION + 
##     JOB + TRAVTIME + CAR_USE + BLUEBOOK + TIF + CAR_TYPE + OLDCLAIM + 
##     CLM_FREQ + MVR_PTS + CAR_AGE + PARENT1_No + MSTATUS_Yes + 
##     RED_CAR_no + REVOKED_No + `URBANICITY_Highly Urban/ Urban`
## 
##                                    Df  Sum of Sq        RSS    AIC
## - OLDCLAIM                          1    6593452 1.1572e+11 101427
## - AGE                               1    9376881 1.1572e+11 101427
## - RED_CAR_no                        1   18825907 1.1573e+11 101428
## - CLM_FREQ                          1   31754169 1.1575e+11 101429
## - INCOME                            1   31998738 1.1575e+11 101429
## - BLUEBOOK                          1   33220175 1.1575e+11 101429
## <none>                                           1.1571e+11 101429
## - HOME_VAL                          1   41847940 1.1576e+11 101429
## - EDUCATION                         4  162853225 1.1588e+11 101429
## - CAR_AGE                           1   59169562 1.1577e+11 101430
## - KIDSDRIV                          1   70513453 1.1579e+11 101431
## - SEX                               1  102991203 1.1582e+11 101432
## - PARENT1_No                        1  109747410 1.1582e+11 101433
## - REVOKED_No                        1  125018867 1.1584e+11 101433
## - TRAVTIME                          1  173346976 1.1589e+11 101436
## - TIF                               1  204467384 1.1592e+11 101438
## - MSTATUS_Yes                       1  213380590 1.1593e+11 101438
## - CAR_USE                           1  328956111 1.1604e+11 101444
## - JOB                               7  679673483 1.1639e+11 101450
## - CAR_TYPE                          5  706136197 1.1642e+11 101456
## - MVR_PTS                           1  681042854 1.1640e+11 101462
## - `URBANICITY_Highly Urban/ Urban`  1 2194627368 1.1791e+11 101541
## 
## Step:  AIC=101427.3
## TARGET_AMT ~ KIDSDRIV + AGE + INCOME + HOME_VAL + SEX + EDUCATION + 
##     JOB + TRAVTIME + CAR_USE + BLUEBOOK + TIF + CAR_TYPE + CLM_FREQ + 
##     MVR_PTS + CAR_AGE + PARENT1_No + MSTATUS_Yes + RED_CAR_no + 
##     REVOKED_No + `URBANICITY_Highly Urban/ Urban`
## 
##                                    Df  Sum of Sq        RSS    AIC
## - AGE                               1    9496700 1.1573e+11 101426
## - RED_CAR_no                        1   18956277 1.1574e+11 101426
## - CLM_FREQ                          1   25160725 1.1575e+11 101427
## - INCOME                            1   32104527 1.1575e+11 101427
## - BLUEBOOK                          1   33214709 1.1575e+11 101427
## <none>                                           1.1572e+11 101427
## - HOME_VAL                          1   42208097 1.1576e+11 101428
## - EDUCATION                         4  163012416 1.1588e+11 101428
## - CAR_AGE                           1   58974407 1.1578e+11 101428
## - KIDSDRIV                          1   71368155 1.1579e+11 101429
## - SEX                               1  103856545 1.1583e+11 101431
## - PARENT1_No                        1  110362036 1.1583e+11 101431
## - REVOKED_No                        1  126806659 1.1585e+11 101432
## - TRAVTIME                          1  175530579 1.1590e+11 101434
## - TIF                               1  205216640 1.1593e+11 101436
## - MSTATUS_Yes                       1  213272987 1.1593e+11 101436
## - CAR_USE                           1  329642166 1.1605e+11 101443
## - JOB                               7  680837686 1.1640e+11 101449
## - CAR_TYPE                          5  707135458 1.1643e+11 101454
## - MVR_PTS                           1  674459473 1.1640e+11 101460
## - `URBANICITY_Highly Urban/ Urban`  1 2196176268 1.1792e+11 101539
## 
## Step:  AIC=101425.8
## TARGET_AMT ~ KIDSDRIV + INCOME + HOME_VAL + SEX + EDUCATION + 
##     JOB + TRAVTIME + CAR_USE + BLUEBOOK + TIF + CAR_TYPE + CLM_FREQ + 
##     MVR_PTS + CAR_AGE + PARENT1_No + MSTATUS_Yes + RED_CAR_no + 
##     REVOKED_No + `URBANICITY_Highly Urban/ Urban`
## 
##                                    Df  Sum of Sq        RSS    AIC
## - RED_CAR_no                        1   18272014 1.1575e+11 101425
## - CLM_FREQ                          1   24664667 1.1576e+11 101425
## - BLUEBOOK                          1   28813026 1.1576e+11 101425
## - INCOME                            1   30025417 1.1576e+11 101425
## <none>                                           1.1573e+11 101426
## - EDUCATION                         4  161207343 1.1589e+11 101426
## - HOME_VAL                          1   46304526 1.1578e+11 101426
## - CAR_AGE                           1   60004338 1.1579e+11 101427
## - KIDSDRIV                          1   70286166 1.1580e+11 101427
## - SEX                               1   97412578 1.1583e+11 101429
## - REVOKED_No                        1  127894375 1.1586e+11 101430
## - PARENT1_No                        1  140005485 1.1587e+11 101431
## - TRAVTIME                          1  174217504 1.1590e+11 101433
## - TIF                               1  205093505 1.1594e+11 101435
## - MSTATUS_Yes                       1  207097688 1.1594e+11 101435
## - CAR_USE                           1  330695298 1.1606e+11 101441
## - JOB                               7  685431673 1.1642e+11 101448
## - CAR_TYPE                          5  697976097 1.1643e+11 101452
## - MVR_PTS                           1  681418683 1.1641e+11 101459
## - `URBANICITY_Highly Urban/ Urban`  1 2203840642 1.1793e+11 101538
## 
## Step:  AIC=101424.8
## TARGET_AMT ~ KIDSDRIV + INCOME + HOME_VAL + SEX + EDUCATION + 
##     JOB + TRAVTIME + CAR_USE + BLUEBOOK + TIF + CAR_TYPE + CLM_FREQ + 
##     MVR_PTS + CAR_AGE + PARENT1_No + MSTATUS_Yes + REVOKED_No + 
##     `URBANICITY_Highly Urban/ Urban`
## 
##                                    Df  Sum of Sq        RSS    AIC
## - CLM_FREQ                          1   23956329 1.1577e+11 101424
## - INCOME                            1   30254178 1.1578e+11 101424
## - BLUEBOOK                          1   30521205 1.1578e+11 101424
## <none>                                           1.1575e+11 101425
## - HOME_VAL                          1   44880251 1.1579e+11 101425
## - EDUCATION                         4  163322639 1.1591e+11 101425
## - CAR_AGE                           1   60520141 1.1581e+11 101426
## - KIDSDRIV                          1   70732988 1.1582e+11 101426
## - SEX                               1   79511727 1.1583e+11 101427
## - REVOKED_No                        1  126885211 1.1588e+11 101429
## - PARENT1_No                        1  140730652 1.1589e+11 101430
## - TRAVTIME                          1  173608071 1.1592e+11 101432
## - TIF                               1  205474008 1.1595e+11 101433
## - MSTATUS_Yes                       1  207122826 1.1596e+11 101434
## - CAR_USE                           1  327908821 1.1608e+11 101440
## - JOB                               7  689640136 1.1644e+11 101447
## - CAR_TYPE                          5  706525174 1.1646e+11 101452
## - MVR_PTS                           1  680422210 1.1643e+11 101458
## - `URBANICITY_Highly Urban/ Urban`  1 2199102531 1.1795e+11 101537
## 
## Step:  AIC=101424
## TARGET_AMT ~ KIDSDRIV + INCOME + HOME_VAL + SEX + EDUCATION + 
##     JOB + TRAVTIME + CAR_USE + BLUEBOOK + TIF + CAR_TYPE + MVR_PTS + 
##     CAR_AGE + PARENT1_No + MSTATUS_Yes + REVOKED_No + `URBANICITY_Highly Urban/ Urban`
## 
##                                    Df  Sum of Sq        RSS    AIC
## - BLUEBOOK                          1   29596008 1.1580e+11 101424
## - INCOME                            1   31216401 1.1580e+11 101424
## <none>                                           1.1577e+11 101424
## - HOME_VAL                          1   46864840 1.1582e+11 101424
## - EDUCATION                         4  163446543 1.1594e+11 101425
## - CAR_AGE                           1   60527935 1.1583e+11 101425
## - KIDSDRIV                          1   73065439 1.1585e+11 101426
## - SEX                               1   82269674 1.1586e+11 101426
## - REVOKED_No                        1  129665521 1.1590e+11 101429
## - PARENT1_No                        1  139933748 1.1591e+11 101429
## - TRAVTIME                          1  179501344 1.1595e+11 101431
## - TIF                               1  207364442 1.1598e+11 101433
## - MSTATUS_Yes                       1  211992738 1.1598e+11 101433
## - CAR_USE                           1  336641614 1.1611e+11 101440
## - JOB                               7  692555117 1.1647e+11 101446
## - CAR_TYPE                          5  726874745 1.1650e+11 101452
## - MVR_PTS                           1  882053637 1.1665e+11 101468
## - `URBANICITY_Highly Urban/ Urban`  1 2419672505 1.1819e+11 101547
## 
## Step:  AIC=101423.6
## TARGET_AMT ~ KIDSDRIV + INCOME + HOME_VAL + SEX + EDUCATION + 
##     JOB + TRAVTIME + CAR_USE + TIF + CAR_TYPE + MVR_PTS + CAR_AGE + 
##     PARENT1_No + MSTATUS_Yes + REVOKED_No + `URBANICITY_Highly Urban/ Urban`
## 
##                                    Df  Sum of Sq        RSS    AIC
## - INCOME                            1   22919033 1.1583e+11 101423
## <none>                                           1.1580e+11 101424
## - EDUCATION                         4  161017781 1.1596e+11 101424
## - HOME_VAL                          1   46298975 1.1585e+11 101424
## - SEX                               1   57222491 1.1586e+11 101425
## - CAR_AGE                           1   60948228 1.1586e+11 101425
## - KIDSDRIV                          1   73724835 1.1588e+11 101425
## - REVOKED_No                        1  129663744 1.1593e+11 101428
## - PARENT1_No                        1  137304720 1.1594e+11 101429
## - TRAVTIME                          1  181307621 1.1598e+11 101431
## - TIF                               1  206084010 1.1601e+11 101432
## - MSTATUS_Yes                       1  210296361 1.1601e+11 101433
## - CAR_USE                           1  338968629 1.1614e+11 101439
## - JOB                               7  690165033 1.1649e+11 101445
## - CAR_TYPE                          5  697296271 1.1650e+11 101450
## - MVR_PTS                           1  872200399 1.1667e+11 101467
## - `URBANICITY_Highly Urban/ Urban`  1 2419270022 1.1822e+11 101547
## 
## Step:  AIC=101422.8
## TARGET_AMT ~ KIDSDRIV + HOME_VAL + SEX + EDUCATION + JOB + TRAVTIME + 
##     CAR_USE + TIF + CAR_TYPE + MVR_PTS + CAR_AGE + PARENT1_No + 
##     MSTATUS_Yes + REVOKED_No + `URBANICITY_Highly Urban/ Urban`
## 
##                                    Df  Sum of Sq        RSS    AIC
## <none>                                           1.1583e+11 101423
## - EDUCATION                         4  153670907 1.1598e+11 101423
## - CAR_AGE                           1   62093976 1.1589e+11 101424
## - SEX                               1   66340374 1.1589e+11 101424
## - KIDSDRIV                          1   71139964 1.1590e+11 101424
## - HOME_VAL                          1  107710327 1.1593e+11 101426
## - REVOKED_No                        1  129311580 1.1595e+11 101428
## - PARENT1_No                        1  139715547 1.1597e+11 101428
## - TRAVTIME                          1  180088157 1.1601e+11 101430
## - MSTATUS_Yes                       1  188276349 1.1601e+11 101431
## - TIF                               1  206076086 1.1603e+11 101432
## - CAR_USE                           1  335947722 1.1616e+11 101438
## - JOB                               7  711659101 1.1654e+11 101446
## - CAR_TYPE                          5  711291671 1.1654e+11 101450
## - MVR_PTS                           1  884896051 1.1671e+11 101467
## - `URBANICITY_Highly Urban/ Urban`  1 2412424760 1.1824e+11 101545
stargazer(model1, model2, model3,
          type = "text"
          ) #summary stats of each of the linear models 
## 
## ==============================================================================================================
##                                                               Dependent variable:                             
##                                  -----------------------------------------------------------------------------
##                                                                   TARGET_AMT                                  
##                                             (1)                       (2)                       (3)           
## --------------------------------------------------------------------------------------------------------------
## KIDSDRIV                                 206.594*                  212.654*                  218.208*         
##                                          (125.525)                 (125.562)                 (113.517)        
##                                                                                                               
## AGE                                       -4.372                    -6.301                                    
##                                           (7.920)                   (7.811)                                   
##                                                                                                               
## HOMEKIDS                                  15.625                    21.378                                    
##                                          (72.424)                  (71.762)                                   
##                                                                                                               
## YOJ                                       -2.232                     5.743                                    
##                                          (16.524)                  (14.944)                                   
##                                                                                                               
## INCOME                                    -0.003                   -0.004**                                   
##                                           (0.002)                   (0.002)                                   
##                                                                                                               
## HOME_VAL                                  -0.001                    -0.001                   -0.001**         
##                                           (0.001)                   (0.001)                   (0.001)         
##                                                                                                               
## SEXz_F                                  -471.463**                -451.198**                 -300.608*        
##                                          (203.891)                 (202.442)                 (161.942)        
##                                                                                                               
## EDUCATIONBachelors                       -362.997                                            -403.359*        
##                                          (226.602)                                           (223.245)        
##                                                                                                               
## EDUCATIONMasters                         -303.552                                            -369.819         
##                                          (339.569)                                           (334.665)        
##                                                                                                               
## EDUCATIONPhD                              437.621                                             310.084         
##                                          (426.167)                                           (410.704)        
##                                                                                                               
## EDUCATIONz_High School                   -221.312                                            -238.057         
##                                          (186.963)                                           (186.204)        
##                                                                                                               
## JOBDoctor                              -1,191.010**                                        -1,265.235***      
##                                          (495.837)                                           (489.641)        
##                                                                                                               
## JOBHome Maker                             -96.718                                             -30.327         
##                                          (279.862)                                           (257.757)        
##                                                                                                               
## JOBLawyer                                 -68.964                                            -127.456         
##                                          (335.856)                                           (331.297)        
##                                                                                                               
## JOBManager                              -958.360***                                        -1,028.263***      
##                                          (260.469)                                           (253.514)        
##                                                                                                               
## JOBProfessional                           128.638                                             75.220          
##                                          (237.174)                                           (231.962)        
##                                                                                                               
## JOBStudent                               -276.939                                            -245.726         
##                                          (264.625)                                           (248.793)        
##                                                                                                               
## JOBz_Blue Collar                          156.855                                             118.662         
##                                          (209.903)                                           (206.701)        
##                                                                                                               
## TRAVTIME                                 10.864***                 11.628***                 11.035***        
##                                           (3.616)                   (3.619)                   (3.608)         
##                                                                                                               
## CAR_USEPrivate                          -755.804***               -867.408***               -763.519***       
##                                          (183.129)                 (139.336)                 (182.781)        
##                                                                                                               
## BLUEBOOK                                   0.013                     0.012                                    
##                                           (0.010)                   (0.010)                                   
##                                                                                                               
## TIF                                     -44.577***                -43.756***                -44.730***        
##                                          (13.679)                  (13.693)                  (13.672)         
##                                                                                                               
## CAR_TYPEPanel Truck                       475.550                   386.274                  639.777**        
##                                          (328.741)                 (315.181)                 (299.782)        
##                                                                                                               
## CAR_TYPEPickup                           405.725**                 363.335**                 388.787**        
##                                          (187.946)                 (184.355)                 (186.416)        
##                                                                                                               
## CAR_TYPESports Car                     1,264.272***              1,225.516***              1,194.577***       
##                                          (236.653)                 (237.135)                 (221.692)        
##                                                                                                               
## CAR_TYPEVan                              489.190**                 495.477**                 543.162**        
##                                          (241.837)                 (239.452)                 (235.256)        
##                                                                                                               
## CAR_TYPEz_SUV                           910.320***                885.654***                850.619***        
##                                          (194.784)                 (195.049)                 (179.335)        
##                                                                                                               
## OLDCLAIM                                  -0.005                    -0.005                                    
##                                           (0.008)                   (0.008)                                   
##                                                                                                               
## CLM_FREQ                                  79.504                    85.876                                    
##                                          (62.029)                  (62.150)                                   
##                                                                                                               
## MVR_PTS                                 172.825***                181.544***                183.338***        
##                                          (29.087)                  (29.070)                  (27.043)         
##                                                                                                               
## CAR_AGE                                  -25.318*                 -37.451***                 -25.954*         
##                                          (14.461)                  (11.360)                  (14.452)         
##                                                                                                               
## PARENT1_No                              -468.900**                -452.925**                -527.718***       
##                                          (224.356)                 (224.678)                 (195.897)        
##                                                                                                               
## MSTATUS_Yes                             -540.981***               -572.276***               -481.786***       
##                                          (167.028)                 (165.943)                 (154.065)        
##                                                                                                               
## RED_CAR_yes                              -169.136                  -193.872                                   
##                                          (171.081)                 (171.367)                                  
##                                                                                                               
## REVOKED_No                              -497.034**                -516.310***               -449.324***       
##                                          (195.367)                 (195.498)                 (173.376)        
##                                                                                                               
## `URBANICITY_Highly Urban/ Urban`       1,640.836***              1,505.856***              1,676.540***       
##                                          (153.684)                 (150.295)                 (149.774)        
##                                                                                                               
## Constant                               1,998.163***              1,988.602***              1,787.354***       
##                                          (568.905)                 (505.544)                 (411.610)        
##                                                                                                               
## --------------------------------------------------------------------------------------------------------------
## Observations                               6,045                     6,045                     6,045          
## R2                                         0.077                     0.070                     0.076          
## Adjusted R2                                0.071                     0.066                     0.071          
## Residual Std. Error                4,388.612 (df = 6008)     4,399.793 (df = 6019)     4,387.813 (df = 6016)  
## F Statistic                      13.827*** (df = 36; 6008) 18.150*** (df = 25; 6019) 17.576*** (df = 28; 6016)
## ==============================================================================================================
## Note:                                                                              *p<0.1; **p<0.05; ***p<0.01

Coefficient Interpretation

  • Age- If there is a one year increase in age, then there is a -4.372 dollar decrease in the amount spent if there was a crash all else being equal. This may make sense because as someone gets older, they may be less reckless when driving.

  • Job type- Having your job be a doctor is negatively correlated (-1,265.235) with the amount of money you would have to pay if you were in a crash. This makes sense because doctors tend to work a lot and be smart people, so they will be less likely to get in an accident and therefore have to pay money because of it. On the other hand blue collar workes

  • Motor vehicle points- If there is a one point increase in motor vehical points, then there is a 183.338 dollar increase in the amount spent if there was a crash, all else being equal. This makes sense because you gain motor vehicle points by getting tickets for speeding or reckless driving. So if someone has more points, that probably means they are not as safe of a driver as someone who does not have any points. And they will be more likely to get in a crash.

  • Travel time- If there is a one (minute or hour?) increase in travel time to work, then there is a 11.035 dollar increase in the amount spent if there was a crash, all else being equal. This makes sense because if someone drives more to work, they are spending more time on the road, so they may be more likely to get in a crash.

Linear Model Selection

  • R2 - Usually, the larger the R2, the better the regression model fits your observations. Between the three models, there is not one that is significantly better than the others. However, model 1 explains 7.7% of the observations and model 3 explains 7.6% of the observations.

  • F stat - This is the ratio of two variances. The F stat for each of the models is highly significant.

  • Residual standard error- The smaller the residual standard error, the better the regression model fits the data set.The residual standard error is a bit lower for model 3 (4,387) than it is for model 2 and model 1.

  • Model 1 contains jobs and education which were highly multicollinear, so I will not be going with that one. Because of the adjusted r2 and the stepAIC function, I am going to go with model 3.

plot(model3)

  1. Residual vs fitted values- Checks if residuals are randomly distributed about the graph and are both negative and positive. The residuals do not seem to be randomly distributed
  2. Normal Q-Q- Tells if residuals are normally distributed by comparing them with the actual distribution. A lot of the data is within 2 standard deviations, however there are a lot of outliers that are not following the distribution.
  3. Scale-location- Shows if residuals are spread equally among predictions in order to check heteroscedasticity. There does not seem to be heteroscedasticity based on this graph. There is no fanning out.
  4. Residuals vs leverage plot- shows influential data points that have a big effect on the linear model. There are quite a few outliers, but only a few that seem to have influence on the model.

Logistic regression

Logit Model 1

model_logit <- glm(formula = TARGET_FLAG ~ . , 
                   data    = insur_train_clean.d, 
                   family  = binomial (link = "logit"))
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(model_logit)
## 
## Call:
## glm(formula = TARGET_FLAG ~ ., family = binomial(link = "logit"), 
##     data = insur_train_clean.d)
## 
## Deviance Residuals: 
##        Min          1Q      Median          3Q         Max  
## -2.417e-03  -2.000e-08  -2.000e-08   2.000e-08   3.016e-03  
## 
## Coefficients: (5 not defined because of singularities)
##                                      Estimate Std. Error z value Pr(>|z|)
## (Intercept)                        -3.360e+01  4.380e+03  -0.008    0.994
## TARGET_AMT                          2.650e-01  3.433e+00   0.077    0.938
## KIDSDRIV                            5.912e+00  1.019e+03   0.006    0.995
## AGE                                -3.261e-01  3.630e+01  -0.009    0.993
## HOMEKIDS                           -1.506e+00  8.913e+02  -0.002    0.999
## YOJ                                -9.020e-01  1.210e+02  -0.007    0.994
## INCOME                              1.876e-05  2.329e-02   0.001    0.999
## HOME_VAL                            4.109e-05  5.654e-03   0.007    0.994
## SEXz_F                             -3.347e+00  2.720e+03  -0.001    0.999
## EDUCATIONBachelors                 -3.468e+00  2.564e+03  -0.001    0.999
## EDUCATIONMasters                    1.441e+01  1.194e+04   0.001    0.999
## EDUCATIONPhD                        4.679e+00  8.190e+03   0.001    1.000
## EDUCATIONz_High School              1.434e+01  1.665e+03   0.009    0.993
## JOBDoctor                          -2.039e+02  2.976e+05  -0.001    0.999
## JOBHome Maker                      -1.215e+01  4.126e+03  -0.003    0.998
## JOBLawyer                          -5.558e+00  1.212e+04   0.000    1.000
## JOBManager                         -2.807e-01  5.761e+03   0.000    1.000
## JOBProfessional                    -5.576e+00  3.433e+04   0.000    1.000
## JOBStudent                          1.136e+01  2.619e+03   0.004    0.997
## JOBz_Blue Collar                    1.617e+01  2.654e+03   0.006    0.995
## TRAVTIME                           -2.008e-01  2.951e+01  -0.007    0.995
## CAR_USEPrivate                     -3.083e+00  1.314e+03  -0.002    0.998
## BLUEBOOK                           -1.111e-03  1.378e-01  -0.008    0.994
## TIF                                -1.120e+00  1.261e+02  -0.009    0.993
## CAR_TYPEPanel Truck                 1.219e+01  3.499e+04   0.000    1.000
## CAR_TYPEPickup                      1.359e+01  1.038e+03   0.013    0.990
## CAR_TYPESports Car                  5.215e+00  2.975e+03   0.002    0.999
## CAR_TYPEVan                        -3.290e+00  4.275e+03  -0.001    0.999
## CAR_TYPEz_SUV                      -1.400e+01  5.675e+03  -0.002    0.998
## OLDCLAIM                           -7.916e-04  6.893e-02  -0.011    0.991
## CLM_FREQ                            7.199e+00  4.179e+02   0.017    0.986
## MVR_PTS                            -2.031e-01  1.682e+02  -0.001    0.999
## CAR_AGE                             1.834e-01  1.529e+02   0.001    0.999
## PARENT1_No                          9.552e+00  2.272e+03   0.004    0.997
## PARENT1_Yes                                NA         NA      NA       NA
## MSTATUS_Yes                        -9.312e+00  8.186e+02  -0.011    0.991
## MSTATUS_z_No                               NA         NA      NA       NA
## RED_CAR_no                          4.338e+00  9.682e+02   0.004    0.996
## RED_CAR_yes                                NA         NA      NA       NA
## REVOKED_No                         -1.933e+00  1.534e+03  -0.001    0.999
## REVOKED_Yes                                NA         NA      NA       NA
## `URBANICITY_Highly Urban/ Urban`    2.318e+00  8.182e+02   0.003    0.998
## `URBANICITY_z_Highly Rural/ Rural`         NA         NA      NA       NA
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 6.9909e+03  on 6044  degrees of freedom
## Residual deviance: 4.5966e-05  on 6007  degrees of freedom
## AIC: 76
## 
## Number of Fisher Scoring iterations: 25

Logit Model 2

model_logit2 <- stepAIC(object = glm(formula = TARGET_FLAG ~ . -TARGET_AMT,
                        data = insur_train_clean.d, 
                        family = binomial (link = "logit")),
                        direction = c("backward"))
## Start:  AIC=5436.27
## TARGET_FLAG ~ (TARGET_AMT + KIDSDRIV + AGE + HOMEKIDS + YOJ + 
##     INCOME + HOME_VAL + SEX + EDUCATION + JOB + TRAVTIME + CAR_USE + 
##     BLUEBOOK + TIF + CAR_TYPE + OLDCLAIM + CLM_FREQ + MVR_PTS + 
##     CAR_AGE + PARENT1_No + PARENT1_Yes + MSTATUS_Yes + MSTATUS_z_No + 
##     RED_CAR_no + RED_CAR_yes + REVOKED_No + REVOKED_Yes + `URBANICITY_Highly Urban/ Urban` + 
##     `URBANICITY_z_Highly Rural/ Rural`) - TARGET_AMT
## 
## 
## Step:  AIC=5436.27
## TARGET_FLAG ~ KIDSDRIV + AGE + HOMEKIDS + YOJ + INCOME + HOME_VAL + 
##     SEX + EDUCATION + JOB + TRAVTIME + CAR_USE + BLUEBOOK + TIF + 
##     CAR_TYPE + OLDCLAIM + CLM_FREQ + MVR_PTS + CAR_AGE + PARENT1_No + 
##     PARENT1_Yes + MSTATUS_Yes + MSTATUS_z_No + RED_CAR_no + RED_CAR_yes + 
##     REVOKED_No + REVOKED_Yes + `URBANICITY_Highly Urban/ Urban`
## 
## 
## Step:  AIC=5436.27
## TARGET_FLAG ~ KIDSDRIV + AGE + HOMEKIDS + YOJ + INCOME + HOME_VAL + 
##     SEX + EDUCATION + JOB + TRAVTIME + CAR_USE + BLUEBOOK + TIF + 
##     CAR_TYPE + OLDCLAIM + CLM_FREQ + MVR_PTS + CAR_AGE + PARENT1_No + 
##     PARENT1_Yes + MSTATUS_Yes + MSTATUS_z_No + RED_CAR_no + RED_CAR_yes + 
##     REVOKED_No + `URBANICITY_Highly Urban/ Urban`
## 
## 
## Step:  AIC=5436.27
## TARGET_FLAG ~ KIDSDRIV + AGE + HOMEKIDS + YOJ + INCOME + HOME_VAL + 
##     SEX + EDUCATION + JOB + TRAVTIME + CAR_USE + BLUEBOOK + TIF + 
##     CAR_TYPE + OLDCLAIM + CLM_FREQ + MVR_PTS + CAR_AGE + PARENT1_No + 
##     PARENT1_Yes + MSTATUS_Yes + MSTATUS_z_No + RED_CAR_no + REVOKED_No + 
##     `URBANICITY_Highly Urban/ Urban`
## 
## 
## Step:  AIC=5436.27
## TARGET_FLAG ~ KIDSDRIV + AGE + HOMEKIDS + YOJ + INCOME + HOME_VAL + 
##     SEX + EDUCATION + JOB + TRAVTIME + CAR_USE + BLUEBOOK + TIF + 
##     CAR_TYPE + OLDCLAIM + CLM_FREQ + MVR_PTS + CAR_AGE + PARENT1_No + 
##     PARENT1_Yes + MSTATUS_Yes + RED_CAR_no + REVOKED_No + `URBANICITY_Highly Urban/ Urban`
## 
## 
## Step:  AIC=5436.27
## TARGET_FLAG ~ KIDSDRIV + AGE + HOMEKIDS + YOJ + INCOME + HOME_VAL + 
##     SEX + EDUCATION + JOB + TRAVTIME + CAR_USE + BLUEBOOK + TIF + 
##     CAR_TYPE + OLDCLAIM + CLM_FREQ + MVR_PTS + CAR_AGE + PARENT1_No + 
##     MSTATUS_Yes + RED_CAR_no + REVOKED_No + `URBANICITY_Highly Urban/ Urban`
## 
##                                    Df Deviance    AIC
## - CAR_AGE                           1   5362.5 5434.5
## - HOMEKIDS                          1   5362.6 5434.6
## - AGE                               1   5363.0 5435.0
## - YOJ                               1   5363.2 5435.2
## <none>                                  5362.3 5436.3
## - SEX                               1   5364.9 5436.9
## - RED_CAR_no                        1   5367.1 5439.1
## - INCOME                            1   5368.1 5440.1
## - OLDCLAIM                          1   5370.6 5442.6
## - PARENT1_No                        1   5372.8 5444.8
## - HOME_VAL                          1   5373.3 5445.3
## - EDUCATION                         4   5382.1 5448.1
## - BLUEBOOK                          1   5376.2 5448.2
## - MSTATUS_Yes                       1   5380.1 5452.1
## - KIDSDRIV                          1   5382.6 5454.6
## - CLM_FREQ                          1   5398.3 5470.3
## - TIF                               1   5401.0 5473.0
## - JOB                               7   5425.0 5485.0
## - TRAVTIME                          1   5413.7 5485.7
## - MVR_PTS                           1   5416.3 5488.3
## - REVOKED_No                        1   5424.1 5496.1
## - CAR_USE                           1   5424.5 5496.5
## - CAR_TYPE                          5   5444.9 5508.9
## - `URBANICITY_Highly Urban/ Urban`  1   5842.8 5914.8
## 
## Step:  AIC=5434.47
## TARGET_FLAG ~ KIDSDRIV + AGE + HOMEKIDS + YOJ + INCOME + HOME_VAL + 
##     SEX + EDUCATION + JOB + TRAVTIME + CAR_USE + BLUEBOOK + TIF + 
##     CAR_TYPE + OLDCLAIM + CLM_FREQ + MVR_PTS + PARENT1_No + MSTATUS_Yes + 
##     RED_CAR_no + REVOKED_No + `URBANICITY_Highly Urban/ Urban`
## 
##                                    Df Deviance    AIC
## - HOMEKIDS                          1   5362.8 5432.8
## - AGE                               1   5363.2 5433.2
## - YOJ                               1   5363.4 5433.4
## <none>                                  5362.5 5434.5
## - SEX                               1   5365.1 5435.1
## - RED_CAR_no                        1   5367.3 5437.3
## - INCOME                            1   5368.4 5438.4
## - OLDCLAIM                          1   5370.8 5440.8
## - PARENT1_No                        1   5373.0 5443.0
## - HOME_VAL                          1   5373.3 5443.3
## - BLUEBOOK                          1   5376.3 5446.3
## - MSTATUS_Yes                       1   5380.3 5450.3
## - EDUCATION                         4   5387.3 5451.3
## - KIDSDRIV                          1   5382.8 5452.8
## - CLM_FREQ                          1   5398.4 5468.4
## - TIF                               1   5401.3 5471.3
## - JOB                               7   5425.2 5483.2
## - TRAVTIME                          1   5413.8 5483.8
## - MVR_PTS                           1   5416.5 5486.5
## - REVOKED_No                        1   5424.2 5494.2
## - CAR_USE                           1   5424.6 5494.6
## - CAR_TYPE                          5   5445.2 5507.2
## - `URBANICITY_Highly Urban/ Urban`  1   5843.0 5913.0
## 
## Step:  AIC=5432.76
## TARGET_FLAG ~ KIDSDRIV + AGE + YOJ + INCOME + HOME_VAL + SEX + 
##     EDUCATION + JOB + TRAVTIME + CAR_USE + BLUEBOOK + TIF + CAR_TYPE + 
##     OLDCLAIM + CLM_FREQ + MVR_PTS + PARENT1_No + MSTATUS_Yes + 
##     RED_CAR_no + REVOKED_No + `URBANICITY_Highly Urban/ Urban`
## 
##                                    Df Deviance    AIC
## - YOJ                               1   5363.5 5431.5
## - AGE                               1   5364.0 5432.0
## <none>                                  5362.8 5432.8
## - SEX                               1   5365.4 5433.4
## - RED_CAR_no                        1   5367.6 5435.6
## - INCOME                            1   5368.6 5436.6
## - OLDCLAIM                          1   5371.1 5439.1
## - HOME_VAL                          1   5373.8 5441.8
## - BLUEBOOK                          1   5376.6 5444.6
## - PARENT1_No                        1   5377.6 5445.6
## - MSTATUS_Yes                       1   5380.7 5448.7
## - EDUCATION                         4   5387.8 5449.8
## - KIDSDRIV                          1   5390.0 5458.0
## - CLM_FREQ                          1   5398.8 5466.8
## - TIF                               1   5401.6 5469.6
## - JOB                               7   5425.4 5481.4
## - TRAVTIME                          1   5414.0 5482.0
## - MVR_PTS                           1   5416.9 5484.9
## - REVOKED_No                        1   5424.8 5492.8
## - CAR_USE                           1   5425.1 5493.1
## - CAR_TYPE                          5   5445.7 5505.7
## - `URBANICITY_Highly Urban/ Urban`  1   5843.1 5911.1
## 
## Step:  AIC=5431.49
## TARGET_FLAG ~ KIDSDRIV + AGE + INCOME + HOME_VAL + SEX + EDUCATION + 
##     JOB + TRAVTIME + CAR_USE + BLUEBOOK + TIF + CAR_TYPE + OLDCLAIM + 
##     CLM_FREQ + MVR_PTS + PARENT1_No + MSTATUS_Yes + RED_CAR_no + 
##     REVOKED_No + `URBANICITY_Highly Urban/ Urban`
## 
##                                    Df Deviance    AIC
## - AGE                               1   5365.0 5431.0
## <none>                                  5363.5 5431.5
## - SEX                               1   5366.1 5432.1
## - RED_CAR_no                        1   5368.3 5434.3
## - INCOME                            1   5369.8 5435.8
## - OLDCLAIM                          1   5372.0 5438.0
## - HOME_VAL                          1   5374.5 5440.5
## - BLUEBOOK                          1   5377.4 5443.4
## - PARENT1_No                        1   5377.9 5443.9
## - EDUCATION                         4   5388.3 5448.3
## - MSTATUS_Yes                       1   5382.8 5448.8
## - KIDSDRIV                          1   5390.4 5456.4
## - CLM_FREQ                          1   5399.6 5465.6
## - TIF                               1   5402.6 5468.6
## - JOB                               7   5425.5 5479.5
## - TRAVTIME                          1   5414.6 5480.6
## - MVR_PTS                           1   5417.9 5483.9
## - REVOKED_No                        1   5425.6 5491.6
## - CAR_USE                           1   5426.2 5492.2
## - CAR_TYPE                          5   5446.8 5504.8
## - `URBANICITY_Highly Urban/ Urban`  1   5843.5 5909.5
## 
## Step:  AIC=5431
## TARGET_FLAG ~ KIDSDRIV + INCOME + HOME_VAL + SEX + EDUCATION + 
##     JOB + TRAVTIME + CAR_USE + BLUEBOOK + TIF + CAR_TYPE + OLDCLAIM + 
##     CLM_FREQ + MVR_PTS + PARENT1_No + MSTATUS_Yes + RED_CAR_no + 
##     REVOKED_No + `URBANICITY_Highly Urban/ Urban`
## 
##                                    Df Deviance    AIC
## <none>                                  5365.0 5431.0
## - SEX                               1   5367.1 5431.1
## - RED_CAR_no                        1   5369.7 5433.7
## - INCOME                            1   5371.0 5435.0
## - OLDCLAIM                          1   5373.6 5437.6
## - HOME_VAL                          1   5376.9 5440.9
## - BLUEBOOK                          1   5380.7 5444.7
## - MSTATUS_Yes                       1   5383.5 5447.5
## - PARENT1_No                        1   5383.9 5447.9
## - EDUCATION                         4   5390.0 5448.0
## - KIDSDRIV                          1   5391.4 5455.4
## - CLM_FREQ                          1   5401.0 5465.0
## - TIF                               1   5404.0 5468.0
## - TRAVTIME                          1   5415.8 5479.8
## - JOB                               7   5428.4 5480.4
## - MVR_PTS                           1   5420.1 5484.1
## - REVOKED_No                        1   5427.4 5491.4
## - CAR_USE                           1   5427.8 5491.8
## - CAR_TYPE                          5   5447.1 5503.1
## - `URBANICITY_Highly Urban/ Urban`  1   5846.8 5910.8
summary(model_logit2)
## 
## Call:
## glm(formula = TARGET_FLAG ~ KIDSDRIV + INCOME + HOME_VAL + SEX + 
##     EDUCATION + JOB + TRAVTIME + CAR_USE + BLUEBOOK + TIF + CAR_TYPE + 
##     OLDCLAIM + CLM_FREQ + MVR_PTS + PARENT1_No + MSTATUS_Yes + 
##     RED_CAR_no + REVOKED_No + `URBANICITY_Highly Urban/ Urban`, 
##     family = binomial(link = "logit"), data = insur_train_clean.d)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.5691  -0.7024  -0.3901   0.6201   3.1495  
## 
## Coefficients:
##                                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                      -1.180e+00  2.635e-01  -4.477 7.58e-06 ***
## KIDSDRIV                          3.307e-01  6.390e-02   5.176 2.27e-07 ***
## INCOME                           -3.471e-06  1.424e-06  -2.439 0.014744 *  
## HOME_VAL                         -1.469e-06  4.267e-07  -3.443 0.000575 ***
## SEXz_F                           -1.861e-01  1.272e-01  -1.464 0.143274    
## EDUCATIONBachelors               -3.995e-01  1.231e-01  -3.245 0.001174 ** 
## EDUCATIONMasters                 -4.969e-01  1.905e-01  -2.609 0.009094 ** 
## EDUCATIONPhD                      4.064e-02  2.461e-01   0.165 0.868809    
## EDUCATIONz_High School           -7.396e-03  1.063e-01  -0.070 0.944554    
## JOBDoctor                        -9.037e-01  3.273e-01  -2.761 0.005755 ** 
## JOBHome Maker                    -2.872e-01  1.569e-01  -1.831 0.067098 .  
## JOBLawyer                        -1.650e-01  2.112e-01  -0.781 0.434854    
## JOBManager                       -1.091e+00  1.641e-01  -6.650 2.93e-11 ***
## JOBProfessional                  -2.958e-01  1.410e-01  -2.098 0.035938 *  
## JOBStudent                       -3.133e-01  1.446e-01  -2.166 0.030305 *  
## JOBz_Blue Collar                 -2.020e-01  1.201e-01  -1.682 0.092562 .  
## TRAVTIME                          1.557e-02  2.189e-03   7.114 1.13e-12 ***
## CAR_USEPrivate                   -8.324e-01  1.059e-01  -7.861 3.82e-15 ***
## BLUEBOOK                         -2.370e-05  6.031e-06  -3.930 8.51e-05 ***
## TIF                              -5.237e-02  8.537e-03  -6.134 8.56e-10 ***
## CAR_TYPEPanel Truck               7.121e-01  1.944e-01   3.664 0.000249 ***
## CAR_TYPEPickup                    5.525e-01  1.153e-01   4.793 1.65e-06 ***
## CAR_TYPESports Car                1.090e+00  1.451e-01   7.512 5.84e-14 ***
## CAR_TYPEVan                       5.778e-01  1.494e-01   3.866 0.000110 ***
## CAR_TYPEz_SUV                     8.112e-01  1.247e-01   6.508 7.63e-11 ***
## OLDCLAIM                         -1.326e-05  4.569e-06  -2.903 0.003700 ** 
## CLM_FREQ                          2.001e-01  3.318e-02   6.032 1.62e-09 ***
## MVR_PTS                           1.170e-01  1.585e-02   7.383 1.54e-13 ***
## PARENT1_No                       -4.730e-01  1.089e-01  -4.342 1.41e-05 ***
## MSTATUS_Yes                      -4.146e-01  9.568e-02  -4.334 1.47e-05 ***
## RED_CAR_no                        2.236e-01  1.031e-01   2.168 0.030141 *  
## REVOKED_No                       -8.551e-01  1.074e-01  -7.965 1.65e-15 ***
## `URBANICITY_Highly Urban/ Urban`  2.308e+00  1.244e-01  18.552  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 6990.9  on 6044  degrees of freedom
## Residual deviance: 5365.0  on 6012  degrees of freedom
## AIC: 5431
## 
## Number of Fisher Scoring iterations: 5

Probit Model

model_probit <- (object = glm(formula = TARGET_FLAG ~ .  -RED_CAR_no -PARENT1_Yes -MSTATUS_z_No -REVOKED_Yes -`URBANICITY_z_Highly Rural/ Rural` -TARGET_AMT,
                    data = insur_train_clean.d , 
                    family = binomial (link = "probit")))
                    
summary(model_probit)
## 
## Call:
## glm(formula = TARGET_FLAG ~ . - RED_CAR_no - PARENT1_Yes - MSTATUS_z_No - 
##     REVOKED_Yes - `URBANICITY_z_Highly Rural/ Rural` - TARGET_AMT, 
##     family = binomial(link = "probit"), data = insur_train_clean.d)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.6127  -0.7222  -0.3921   0.6492   3.4816  
## 
## Coefficients:
##                                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                      -4.353e-01  1.949e-01  -2.233 0.025530 *  
## KIDSDRIV                          1.808e-01  4.107e-02   4.403 1.07e-05 ***
## AGE                              -1.972e-03  2.706e-03  -0.729 0.466202    
## HOMEKIDS                          1.650e-02  2.474e-02   0.667 0.504891    
## YOJ                              -5.094e-03  5.652e-03  -0.901 0.367459    
## INCOME                           -1.921e-06  8.131e-07  -2.363 0.018148 *  
## HOME_VAL                         -7.368e-07  2.456e-07  -3.001 0.002693 ** 
## SEXz_F                           -1.234e-01  7.334e-02  -1.682 0.092507 .  
## EDUCATIONBachelors               -2.183e-01  7.684e-02  -2.841 0.004493 ** 
## EDUCATIONMasters                 -2.423e-01  1.216e-01  -1.993 0.046240 *  
## EDUCATIONPhD                      5.496e-02  1.502e-01   0.366 0.714393    
## EDUCATIONz_High School            4.212e-03  6.225e-02   0.068 0.946052    
## JOBDoctor                        -5.337e-01  1.831e-01  -2.915 0.003560 ** 
## JOBHome Maker                    -1.900e-01  9.660e-02  -1.967 0.049160 *  
## JOBLawyer                        -1.006e-01  1.202e-01  -0.837 0.402664    
## JOBManager                       -6.055e-01  9.278e-02  -6.527 6.73e-11 ***
## JOBProfessional                  -1.614e-01  8.186e-02  -1.972 0.048613 *  
## JOBStudent                       -1.837e-01  8.868e-02  -2.072 0.038311 *  
## JOBz_Blue Collar                 -1.102e-01  7.016e-02  -1.571 0.116228    
## TRAVTIME                          9.025e-03  1.255e-03   7.194 6.31e-13 ***
## CAR_USEPrivate                   -4.705e-01  6.145e-02  -7.657 1.91e-14 ***
## BLUEBOOK                         -1.268e-05  3.477e-06  -3.647 0.000265 ***
## TIF                              -3.099e-02  4.880e-03  -6.350 2.16e-10 ***
## CAR_TYPEPanel Truck               3.930e-01  1.125e-01   3.492 0.000479 ***
## CAR_TYPEPickup                    3.112e-01  6.596e-02   4.717 2.39e-06 ***
## CAR_TYPESports Car                6.396e-01  8.368e-02   7.644 2.11e-14 ***
## CAR_TYPEVan                       3.162e-01  8.574e-02   3.688 0.000226 ***
## CAR_TYPEz_SUV                     4.728e-01  7.123e-02   6.638 3.19e-11 ***
## OLDCLAIM                         -6.992e-06  2.670e-06  -2.619 0.008832 ** 
## CLM_FREQ                          1.196e-01  1.954e-02   6.122 9.25e-10 ***
## MVR_PTS                           6.721e-02  9.311e-03   7.219 5.25e-13 ***
## CAR_AGE                          -2.018e-03  5.089e-03  -0.397 0.691664    
## PARENT1_No                       -2.345e-01  7.370e-02  -3.181 0.001466 ** 
## MSTATUS_Yes                      -2.550e-01  5.784e-02  -4.408 1.04e-05 ***
## RED_CAR_yes                      -1.229e-01  5.935e-02  -2.071 0.038332 *  
## REVOKED_No                       -4.869e-01  6.261e-02  -7.776 7.46e-15 ***
## `URBANICITY_Highly Urban/ Urban`  1.264e+00  6.482e-02  19.508  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 6990.9  on 6044  degrees of freedom
## Residual deviance: 5374.6  on 6008  degrees of freedom
## AIC: 5448.6
## 
## Number of Fisher Scoring iterations: 5
stargazer(model_logit, model_logit2, model_probit,
          type = "text")
## 
## ========================================================================
##                                             Dependent variable:         
##                                    -------------------------------------
##                                                 TARGET_FLAG             
##                                            logistic            probit   
##                                         (1)          (2)         (3)    
## ------------------------------------------------------------------------
## TARGET_AMT                             0.265                            
##                                       (3.433)                           
##                                                                         
## KIDSDRIV                               5.912      0.331***    0.181***  
##                                     (1,018.797)    (0.064)     (0.041)  
##                                                                         
## AGE                                   -0.326                   -0.002   
##                                      (36.302)                  (0.003)  
##                                                                         
## HOMEKIDS                              -1.506                    0.016   
##                                      (891.280)                 (0.025)  
##                                                                         
## YOJ                                   -0.902                   -0.005   
##                                      (120.996)                 (0.006)  
##                                                                         
## INCOME                                0.00002    -0.00000**  -0.00000** 
##                                       (0.023)     (0.00000)   (0.00000) 
##                                                                         
## HOME_VAL                              0.00004    -0.00000*** -0.00000***
##                                       (0.006)     (0.00000)   (0.00000) 
##                                                                         
## SEXz_F                                -3.347       -0.186      -0.123*  
##                                     (2,719.828)    (0.127)     (0.073)  
##                                                                         
## EDUCATIONBachelors                    -3.468      -0.399***   -0.218*** 
##                                     (2,564.242)    (0.123)     (0.077)  
##                                                                         
## EDUCATIONMasters                      14.409      -0.497***   -0.242**  
##                                    (11,936.760)    (0.190)     (0.122)  
##                                                                         
## EDUCATIONPhD                           4.679        0.041       0.055   
##                                     (8,189.922)    (0.246)     (0.150)  
##                                                                         
## EDUCATIONz_High School                14.337       -0.007       0.004   
##                                     (1,665.372)    (0.106)     (0.062)  
##                                                                         
## JOBDoctor                            -203.858     -0.904***   -0.534*** 
##                                    (297,612.500)   (0.327)     (0.183)  
##                                                                         
## JOBHome Maker                         -12.154      -0.287*    -0.190**  
##                                     (4,125.562)    (0.157)     (0.097)  
##                                                                         
## JOBLawyer                             -5.558       -0.165      -0.101   
##                                    (12,118.490)    (0.211)     (0.120)  
##                                                                         
## JOBManager                            -0.281      -1.091***   -0.606*** 
##                                     (5,761.114)    (0.164)     (0.093)  
##                                                                         
## JOBProfessional                       -5.576      -0.296**    -0.161**  
##                                    (34,325.550)    (0.141)     (0.082)  
##                                                                         
## JOBStudent                            11.360      -0.313**    -0.184**  
##                                     (2,619.123)    (0.145)     (0.089)  
##                                                                         
## JOBz_Blue Collar                      16.174       -0.202*     -0.110   
##                                     (2,653.715)    (0.120)     (0.070)  
##                                                                         
## TRAVTIME                              -0.201      0.016***    0.009***  
##                                      (29.505)      (0.002)     (0.001)  
##                                                                         
## CAR_USEPrivate                        -3.083      -0.832***   -0.471*** 
##                                     (1,314.093)    (0.106)     (0.061)  
##                                                                         
## BLUEBOOK                              -0.001     -0.00002*** -0.00001***
##                                       (0.138)     (0.00001)   (0.00000) 
##                                                                         
## TIF                                   -1.120      -0.052***   -0.031*** 
##                                      (126.129)     (0.009)     (0.005)  
##                                                                         
## CAR_TYPEPanel Truck                   12.190      0.712***    0.393***  
##                                    (34,992.480)    (0.194)     (0.113)  
##                                                                         
## CAR_TYPEPickup                        13.589      0.552***    0.311***  
##                                     (1,038.148)    (0.115)     (0.066)  
##                                                                         
## CAR_TYPESports Car                     5.215      1.090***    0.640***  
##                                     (2,975.369)    (0.145)     (0.084)  
##                                                                         
## CAR_TYPEVan                           -3.290      0.578***    0.316***  
##                                     (4,274.571)    (0.149)     (0.086)  
##                                                                         
## CAR_TYPEz_SUV                         -14.003     0.811***    0.473***  
##                                     (5,675.033)    (0.125)     (0.071)  
##                                                                         
## OLDCLAIM                              -0.001     -0.00001*** -0.00001***
##                                       (0.069)     (0.00000)   (0.00000) 
##                                                                         
## CLM_FREQ                               7.199      0.200***    0.120***  
##                                      (417.858)     (0.033)     (0.020)  
##                                                                         
## MVR_PTS                               -0.203      0.117***    0.067***  
##                                      (168.248)     (0.016)     (0.009)  
##                                                                         
## CAR_AGE                                0.183                   -0.002   
##                                      (152.948)                 (0.005)  
##                                                                         
## PARENT1_No                             9.552      -0.473***   -0.234*** 
##                                     (2,272.261)    (0.109)     (0.074)  
##                                                                         
## PARENT1_Yes                                                             
##                                                                         
##                                                                         
## MSTATUS_Yes                           -9.312      -0.415***   -0.255*** 
##                                      (818.561)     (0.096)     (0.058)  
##                                                                         
## MSTATUS_z_No                                                            
##                                                                         
##                                                                         
## RED_CAR_no                             4.338       0.224**              
##                                      (968.203)     (0.103)              
##                                                                         
## RED_CAR_yes                                                   -0.123**  
##                                                                (0.059)  
##                                                                         
## REVOKED_No                            -1.933      -0.855***   -0.487*** 
##                                     (1,533.957)    (0.107)     (0.063)  
##                                                                         
## REVOKED_Yes                                                             
##                                                                         
##                                                                         
## `URBANICITY_Highly Urban/ Urban`       2.318      2.308***    1.264***  
##                                      (818.160)     (0.124)     (0.065)  
##                                                                         
## `URBANICITY_z_Highly Rural/ Rural`                                      
##                                                                         
##                                                                         
## Constant                              -33.601     -1.180***   -0.435**  
##                                     (4,380.121)    (0.264)     (0.195)  
##                                                                         
## ------------------------------------------------------------------------
## Observations                           6,045        6,045       6,045   
## Log Likelihood                       -0.00002    -2,682.500  -2,687.279 
## Akaike Inf. Crit.                     76.000      5,431.000   5,448.557 
## ========================================================================
## Note:                                        *p<0.1; **p<0.05; ***p<0.01
exp(-.326)
## [1] 0.7218052
exp(.016)
## [1] 1.016129
exp(-0.399)
## [1] 0.6709907
exp(-14.003)
## [1] 8.290379e-07

Coefficient Interpretation

  • The people who have a Bachelors Degree have .671 times the odds of those who do not have a bachelors to get into a car crash.
  • The people who drive a SUV have 8.290379e-07 times the odds of those who do not drive a SUV to get into a car crash

Confusion Matrix

library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
summary(model_logit)
## 
## Call:
## glm(formula = TARGET_FLAG ~ ., family = binomial(link = "logit"), 
##     data = insur_train_clean.d)
## 
## Deviance Residuals: 
##        Min          1Q      Median          3Q         Max  
## -2.417e-03  -2.000e-08  -2.000e-08   2.000e-08   3.016e-03  
## 
## Coefficients: (5 not defined because of singularities)
##                                      Estimate Std. Error z value Pr(>|z|)
## (Intercept)                        -3.360e+01  4.380e+03  -0.008    0.994
## TARGET_AMT                          2.650e-01  3.433e+00   0.077    0.938
## KIDSDRIV                            5.912e+00  1.019e+03   0.006    0.995
## AGE                                -3.261e-01  3.630e+01  -0.009    0.993
## HOMEKIDS                           -1.506e+00  8.913e+02  -0.002    0.999
## YOJ                                -9.020e-01  1.210e+02  -0.007    0.994
## INCOME                              1.876e-05  2.329e-02   0.001    0.999
## HOME_VAL                            4.109e-05  5.654e-03   0.007    0.994
## SEXz_F                             -3.347e+00  2.720e+03  -0.001    0.999
## EDUCATIONBachelors                 -3.468e+00  2.564e+03  -0.001    0.999
## EDUCATIONMasters                    1.441e+01  1.194e+04   0.001    0.999
## EDUCATIONPhD                        4.679e+00  8.190e+03   0.001    1.000
## EDUCATIONz_High School              1.434e+01  1.665e+03   0.009    0.993
## JOBDoctor                          -2.039e+02  2.976e+05  -0.001    0.999
## JOBHome Maker                      -1.215e+01  4.126e+03  -0.003    0.998
## JOBLawyer                          -5.558e+00  1.212e+04   0.000    1.000
## JOBManager                         -2.807e-01  5.761e+03   0.000    1.000
## JOBProfessional                    -5.576e+00  3.433e+04   0.000    1.000
## JOBStudent                          1.136e+01  2.619e+03   0.004    0.997
## JOBz_Blue Collar                    1.617e+01  2.654e+03   0.006    0.995
## TRAVTIME                           -2.008e-01  2.951e+01  -0.007    0.995
## CAR_USEPrivate                     -3.083e+00  1.314e+03  -0.002    0.998
## BLUEBOOK                           -1.111e-03  1.378e-01  -0.008    0.994
## TIF                                -1.120e+00  1.261e+02  -0.009    0.993
## CAR_TYPEPanel Truck                 1.219e+01  3.499e+04   0.000    1.000
## CAR_TYPEPickup                      1.359e+01  1.038e+03   0.013    0.990
## CAR_TYPESports Car                  5.215e+00  2.975e+03   0.002    0.999
## CAR_TYPEVan                        -3.290e+00  4.275e+03  -0.001    0.999
## CAR_TYPEz_SUV                      -1.400e+01  5.675e+03  -0.002    0.998
## OLDCLAIM                           -7.916e-04  6.893e-02  -0.011    0.991
## CLM_FREQ                            7.199e+00  4.179e+02   0.017    0.986
## MVR_PTS                            -2.031e-01  1.682e+02  -0.001    0.999
## CAR_AGE                             1.834e-01  1.529e+02   0.001    0.999
## PARENT1_No                          9.552e+00  2.272e+03   0.004    0.997
## PARENT1_Yes                                NA         NA      NA       NA
## MSTATUS_Yes                        -9.312e+00  8.186e+02  -0.011    0.991
## MSTATUS_z_No                               NA         NA      NA       NA
## RED_CAR_no                          4.338e+00  9.682e+02   0.004    0.996
## RED_CAR_yes                                NA         NA      NA       NA
## REVOKED_No                         -1.933e+00  1.534e+03  -0.001    0.999
## REVOKED_Yes                                NA         NA      NA       NA
## `URBANICITY_Highly Urban/ Urban`    2.318e+00  8.182e+02   0.003    0.998
## `URBANICITY_z_Highly Rural/ Rural`         NA         NA      NA       NA
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 6.9909e+03  on 6044  degrees of freedom
## Residual deviance: 4.5966e-05  on 6007  degrees of freedom
## AIC: 76
## 
## Number of Fisher Scoring iterations: 25
predicted <- predict(model_logit, data = insur_train_clean.d, type="response")

insur_train_clean.d$predicted <- predict(model_logit, data = insur_train_clean.d, type="response")

predicted_binary <- ifelse(test = insur_train_clean.d$predicted>.5,yes = 1,no = 0)

insur_train_clean.d$predicted_binary <- ifelse(test = insur_train_clean.d$predicted>.5,yes = 1,no = 0)

table(insur_train_clean.d$predicted_binary)
## 
##    0    1 
## 4443 1602
table(insur_train_clean.d$TARGET_FLAG)
## 
##    0    1 
## 4443 1602
table(insur_train_clean.d$TARGET_FLAG, insur_train_clean.d$predicted_binary)
##    
##        0    1
##   0 4443    0
##   1    0 1602
confusionMatrix(reference = factor(insur_train_clean.d$TARGET_FLAG),
                data      = factor(insur_train_clean.d$predicted_binary),
                positive  = "1"
                )
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 4443    0
##          1    0 1602
##                                      
##                Accuracy : 1          
##                  95% CI : (0.9994, 1)
##     No Information Rate : 0.735      
##     P-Value [Acc > NIR] : < 2.2e-16  
##                                      
##                   Kappa : 1          
##                                      
##  Mcnemar's Test P-Value : NA         
##                                      
##             Sensitivity : 1.000      
##             Specificity : 1.000      
##          Pos Pred Value : 1.000      
##          Neg Pred Value : 1.000      
##              Prevalence : 0.265      
##          Detection Rate : 0.265      
##    Detection Prevalence : 0.265      
##       Balanced Accuracy : 1.000      
##                                      
##        'Positive' Class : 1          
## 

In terms of the classification error rate (0), accuracy(1), sensitivity(1) and specificity(1), this is a perfect fitting model. However, this could be for a couple of different reasons. This model could be over fitting to the actual data set. In addition, a confusion matrix is usually done with the test data set. This confusion matrix was done with the training data set, and there were no errors.

insurance_predictions <-  insur_eval_clean.d %>%
  mutate(predictions = predict(model_logit, newdata= insur_eval_clean.d ))
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `predictions = predict(model_logit, newdata =
##   insur_eval_clean.d)`.
## Caused by warning in `predict.lm()`:
## ! prediction from a rank-deficient fit may be misleading
head(insurance_predictions)
##   TARGET_FLAG TARGET_AMT KIDSDRIV AGE HOMEKIDS YOJ INCOME HOME_VAL SEX
## 1           0          0        0  60        0  11  67349        0   M
## 2           0          0        0  43        0  11  91449   257252   M
## 3           0          0        0  35        1  10  16039   124191 z_F
## 4           1       2946        0  34        1  12 125301        0 z_F
## 5           1       2501        0  34        0  10  62978        0 z_F
## 6           0          0        0  50        0   7 106952        0   M
##       EDUCATION           JOB TRAVTIME    CAR_USE BLUEBOOK TIF   CAR_TYPE
## 1           PhD  Professional       14    Private    14230  11    Minivan
## 2 z_High School z_Blue Collar       22 Commercial    14940   1    Minivan
## 3 z_High School      Clerical        5    Private     4010   4      z_SUV
## 4     Bachelors z_Blue Collar       46 Commercial    17430   1 Sports Car
## 5     Bachelors      Clerical       34    Private    11200   1      z_SUV
## 6     Bachelors  Professional       48 Commercial    18510   7        Van
##   OLDCLAIM CLM_FREQ MVR_PTS CAR_AGE PARENT1_No PARENT1_Yes MSTATUS_Yes
## 1     4461        2       3      18          1           0           0
## 2        0        0       0       1          1           0           0
## 3    38690        2       3      10          1           0           1
## 4        0        0       0       7          0           1           0
## 5        0        0       0       1          1           0           0
## 6        0        0       1      17          1           0           0
##   MSTATUS_z_No RED_CAR_no RED_CAR_yes REVOKED_No REVOKED_Yes
## 1            1          0           1          1           0
## 2            1          0           1          1           0
## 3            0          1           0          1           0
## 4            1          1           0          1           0
## 5            1          1           0          1           0
## 6            1          1           0          1           0
##   URBANICITY_Highly Urban/ Urban URBANICITY_z_Highly Rural/ Rural predictions
## 1                              1                                0   -73.25617
## 2                              1                                0   -26.76305
## 3                              1                                0   -76.21662
## 4                              1                                0   716.75139
## 5                              1                                0   580.29333
## 6                              0                                1   -89.72293

This model predicted that those who got in a crash would be fairly younger than those who didn’t. But neither of them had motor vehical points and both had a bachelors degree which I am surprised about.