Overview

In this homework assignment, we will explore, analyze and model a data set containing approximately 8000 records representing a customer at an auto insurance company. Each record has two response variables. The first response variable, TARGET_FLAG, is a 1 or a 0. A “1” means that the person was in a car crash. A zero means that the person was not in a car crash. The second response variable is TARGET_AMT. This value is zero if the person did not crash their car. But if they did crash their car, this number will be a value greater than zero. Below is a short description of the variables of interest in the data set:

Objective

Our objective is to build multiple linear regression and binary logistic regression models on the training data to predict the probability that a person will crash their car and also the amount of money it will cost if the person does crash their car.

library(e1071)
library(dplyr)
library(purrr)
library(tidyr)
library(ggplot2)
library(corrplot)
library(FactoMineR)
library(VIF)
library(knitr)
library(kableExtra)
library(Hmisc)
library(pROC)
library(binr)
library(MASS)

Data Exploration

train = 
  read.csv("https://raw.githubusercontent.com/mharrisonbaker/DATA621_GroupWork2/main/HW4/insurance_training_data.csv", 
           header = TRUE) %>% dplyr::select(-INDEX)
dim(train)
## [1] 8161   25
names(train)
##  [1] "TARGET_FLAG" "TARGET_AMT"  "KIDSDRIV"    "AGE"         "HOMEKIDS"   
##  [6] "YOJ"         "INCOME"      "PARENT1"     "HOME_VAL"    "MSTATUS"    
## [11] "SEX"         "EDUCATION"   "JOB"         "TRAVTIME"    "CAR_USE"    
## [16] "BLUEBOOK"    "TIF"         "CAR_TYPE"    "RED_CAR"     "OLDCLAIM"   
## [21] "CLM_FREQ"    "REVOKED"     "MVR_PTS"     "CAR_AGE"     "URBANICITY"
kable(train[1:15,]) %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),latex_options="scale_down")
TARGET_FLAG TARGET_AMT KIDSDRIV AGE HOMEKIDS YOJ INCOME PARENT1 HOME_VAL MSTATUS SEX EDUCATION JOB TRAVTIME CAR_USE BLUEBOOK TIF CAR_TYPE RED_CAR OLDCLAIM CLM_FREQ REVOKED MVR_PTS CAR_AGE URBANICITY
0 0.000 0 60 0 11 $67,349 No $0 z_No M PhD Professional 14 Private $14,230 11 Minivan yes $4,461 2 No 3 18 Highly Urban/ Urban
0 0.000 0 43 0 11 $91,449 No $257,252 z_No M z_High School z_Blue Collar 22 Commercial $14,940 1 Minivan yes $0 0 No 0 1 Highly Urban/ Urban
0 0.000 0 35 1 10 $16,039 No $124,191 Yes z_F z_High School Clerical 5 Private $4,010 4 z_SUV no $38,690 2 No 3 10 Highly Urban/ Urban
0 0.000 0 51 0 14 No $306,251 Yes M <High School z_Blue Collar 32 Private $15,440 7 Minivan yes $0 0 No 0 6 Highly Urban/ Urban
0 0.000 0 50 0 NA $114,986 No $243,925 Yes z_F PhD Doctor 36 Private $18,000 1 z_SUV no $19,217 2 Yes 3 17 Highly Urban/ Urban
1 2946.000 0 34 1 12 $125,301 Yes $0 z_No z_F Bachelors z_Blue Collar 46 Commercial $17,430 1 Sports Car no $0 0 No 0 7 Highly Urban/ Urban
0 0.000 0 54 0 NA $18,755 No Yes z_F <High School z_Blue Collar 33 Private $8,780 1 z_SUV no $0 0 No 0 1 Highly Urban/ Urban
1 4021.000 1 37 2 NA $107,961 No $333,680 Yes M Bachelors z_Blue Collar 44 Commercial $16,970 1 Van yes $2,374 1 Yes 10 7 Highly Urban/ Urban
1 2501.000 0 34 0 10 $62,978 No $0 z_No z_F Bachelors Clerical 34 Private $11,200 1 z_SUV no $0 0 No 0 1 Highly Urban/ Urban
0 0.000 0 50 0 7 $106,952 No $0 z_No M Bachelors Professional 48 Commercial $18,510 7 Van no $0 0 No 1 17 z_Highly Rural/ Rural
1 6077.000 0 53 0 14 $77,100 No $0 z_No z_F Masters Lawyer 15 Private $18,300 1 Sports Car no $0 0 No 0 11 Highly Urban/ Urban
0 0.000 0 43 0 5 $52,642 No $209,970 Yes z_F Masters Professional 36 Private $22,420 7 Minivan no $0 0 No 0 1 z_Highly Rural/ Rural
0 0.000 0 55 0 11 $59,162 No $180,232 Yes M Bachelors Manager 25 Commercial $17,600 7 Van yes $5,028 2 Yes 3 9 Highly Urban/ Urban
1 1267.000 0 53 0 11 $130,795 No $0 z_No M PhD 64 Commercial $28,340 6 Panel Truck yes $0 0 No 3 10 Highly Urban/ Urban
1 2920.167 0 45 0 0 $0 No $106,859 Yes z_F <High School Home Maker 48 Private $6,000 1 z_SUV no $0 0 No 3 5 Highly Urban/ Urban
#summarize training data
kable(psych::describe(train), digits = 3) %>% kable_styling(bootstrap_options = c("striped", "hover", "responsive"),latex_options="scale_down")
vars n mean sd median trimmed mad min max range skew kurtosis se
TARGET_FLAG 1 8161 0.264 0.441 0 0.205 0.000 0 1.0 1.0 1.072 -0.852 0.005
TARGET_AMT 2 8161 1504.325 4704.027 0 593.712 0.000 0 107586.1 107586.1 8.706 112.288 52.071
KIDSDRIV 3 8161 0.171 0.512 0 0.025 0.000 0 4.0 4.0 3.352 11.780 0.006
AGE 4 8155 44.790 8.628 45 44.831 8.896 16 81.0 65.0 -0.029 -0.062 0.096
HOMEKIDS 5 8161 0.721 1.116 0 0.497 0.000 0 5.0 5.0 1.341 0.649 0.012
YOJ 6 7707 10.499 4.092 11 11.071 2.965 0 23.0 23.0 -1.203 1.177 0.047
INCOME* 7 8161 2875.551 2090.679 2817 2816.953 2799.149 1 6613.0 6612.0 0.109 -1.285 23.143
PARENT1* 8 8161 1.132 0.338 1 1.040 0.000 1 2.0 1.0 2.174 2.728 0.004
HOME_VAL* 9 8161 1684.893 1697.379 1245 1516.499 1842.872 1 5107.0 5106.0 0.516 -1.181 18.789
MSTATUS* 10 8161 1.400 0.490 1 1.375 0.000 1 2.0 1.0 0.407 -1.835 0.005
SEX* 11 8161 1.536 0.499 2 1.545 0.000 1 2.0 1.0 -0.145 -1.979 0.006
EDUCATION* 12 8161 3.091 1.445 3 3.113 1.483 1 5.0 4.0 0.116 -1.380 0.016
JOB* 13 8161 5.687 2.682 6 5.815 2.965 1 9.0 8.0 -0.307 -1.222 0.030
TRAVTIME 14 8161 33.486 15.908 33 32.995 16.309 5 142.0 137.0 0.447 0.664 0.176
CAR_USE* 15 8161 1.629 0.483 2 1.661 0.000 1 2.0 1.0 -0.533 -1.716 0.005
BLUEBOOK* 16 8161 1283.619 893.512 1124 1259.567 1132.706 1 2789.0 2788.0 0.247 -1.362 9.891
TIF 17 8161 5.351 4.147 4 4.840 4.448 1 25.0 24.0 0.891 0.422 0.046
CAR_TYPE* 18 8161 3.530 1.965 3 3.537 2.965 1 6.0 5.0 -0.005 -1.517 0.022
RED_CAR* 19 8161 1.291 0.454 1 1.239 0.000 1 2.0 1.0 0.918 -1.157 0.005
OLDCLAIM* 20 8161 552.271 862.201 1 380.320 0.000 1 2857.0 2856.0 1.309 0.246 9.544
CLM_FREQ 21 8161 0.799 1.158 0 0.589 0.000 0 5.0 5.0 1.209 0.284 0.013
REVOKED* 22 8161 1.123 0.328 1 1.028 0.000 1 2.0 1.0 2.302 3.299 0.004
MVR_PTS 23 8161 1.696 2.147 1 1.314 1.483 0 13.0 13.0 1.348 1.375 0.024
CAR_AGE 24 7651 8.328 5.701 8 7.963 7.413 -3 28.0 31.0 0.282 -0.749 0.065
URBANICITY* 25 8161 1.205 0.403 1 1.131 0.000 1 2.0 1.0 1.465 0.146 0.004

There are 8161 records and 26 variables in this data set with some missing values in few selected variables such as AGE, YOJ and CAR_AGE. These variables may require imputation using median values or other methods. The data consists of two response variables: TARGET_FLAG and TARGET_AMT. TARGET_FLAG is a binary variable with Yes/No indicating if the car was involved in a crash. TARGET_AMT is the cost of the crash.

Visual Exploration

Boxplots

The below boxplots will show all of the variables listed in the dataset and how the data is spread for each variable.

library(reshape)
ggplot(melt(train), aes(x=factor(variable), y=value)) + 
  facet_wrap(~variable, scale="free") + 
  geom_boxplot()

Histograms

ggplot(melt(train), aes(x=value)) + 
  facet_wrap(~variable, scale="free") + 
  geom_histogram(bins=50)

Data Preparation

Data seems to somewhat unstructured upon loading to R. As an example, income need to converted to a numeric value. Some of the data has extra character “z_” before variables. These need to be removed from data.

Transform data

This step is required to clean the data in order to analyze it.

Remove $ sign from the INCOME, HOME_VAL, BLUEBOOK and OLDCLAIM. replace " " with underscore “_" of variables: EDUCATION, JOB, CAR_TYPE, URBANICITY. Change it as factors for above variables plus TARGET_FRAG

currencyconv = function(input) {
  out = sub("\\$", "", input)
  out = as.numeric(sub(",", "", out))
  return(out)
}
# Replace spaces with underscores
underscore = function(input) {
  out = sub(" ", "_", input)
  return(out)
}
train = as.tbl(train) %>% 
  mutate_at(c("INCOME","HOME_VAL","BLUEBOOK","OLDCLAIM"),
            currencyconv) %>% 
  mutate_at(c("EDUCATION","JOB","CAR_TYPE","URBANICITY"),
            underscore) %>% 
  mutate_at(c("EDUCATION","JOB","CAR_TYPE","URBANICITY"),
            as.factor) %>% 
  mutate(TARGET_FLAG = as.factor(TARGET_FLAG))
#check data
summary(train) %>% kable() %>% kable_styling()
TARGET_FLAG TARGET_AMT KIDSDRIV AGE HOMEKIDS YOJ INCOME PARENT1 HOME_VAL MSTATUS SEX EDUCATION JOB TRAVTIME CAR_USE BLUEBOOK TIF CAR_TYPE RED_CAR OLDCLAIM CLM_FREQ REVOKED MVR_PTS CAR_AGE URBANICITY
0:6008 Min. : 0 Min. :0.0000 Min. :16.00 Min. :0.0000 Min. : 0.0 Min. : 0 No :7084 Min. : 0 Yes :4894 M :3786 <High_School :1203 z_Blue_Collar:1825 Min. : 5.00 Commercial:3029 Min. : 1500 Min. : 1.000 Minivan :2145 no :5783 Min. : 0 Min. :0.0000 No :7161 Min. : 0.000 Min. :-3.000 Highly_Urban/ Urban :6492
1:2153 1st Qu.: 0 1st Qu.:0.0000 1st Qu.:39.00 1st Qu.:0.0000 1st Qu.: 9.0 1st Qu.: 28097 Yes:1077 1st Qu.: 0 z_No:3267 z_F:4375 Bachelors :2242 Clerical :1271 1st Qu.: 22.00 Private :5132 1st Qu.: 9280 1st Qu.: 1.000 Panel_Truck: 676 yes:2378 1st Qu.: 0 1st Qu.:0.0000 Yes:1000 1st Qu.: 0.000 1st Qu.: 1.000 z_Highly_Rural/ Rural:1669
NA Median : 0 Median :0.0000 Median :45.00 Median :0.0000 Median :11.0 Median : 54028 NA Median :161160 NA NA Masters :1658 Professional :1117 Median : 33.00 NA Median :14440 Median : 4.000 Pickup :1389 NA Median : 0 Median :0.0000 NA Median : 1.000 Median : 8.000 NA
NA Mean : 1504 Mean :0.1711 Mean :44.79 Mean :0.7212 Mean :10.5 Mean : 61898 NA Mean :154867 NA NA PhD : 728 Manager : 988 Mean : 33.49 NA Mean :15710 Mean : 5.351 Sports_Car : 907 NA Mean : 4037 Mean :0.7986 NA Mean : 1.696 Mean : 8.328 NA
NA 3rd Qu.: 1036 3rd Qu.:0.0000 3rd Qu.:51.00 3rd Qu.:1.0000 3rd Qu.:13.0 3rd Qu.: 85986 NA 3rd Qu.:238724 NA NA z_High_School:2330 Lawyer : 835 3rd Qu.: 44.00 NA 3rd Qu.:20850 3rd Qu.: 7.000 Van : 750 NA 3rd Qu.: 4636 3rd Qu.:2.0000 NA 3rd Qu.: 3.000 3rd Qu.:12.000 NA
NA Max. :107586 Max. :4.0000 Max. :81.00 Max. :5.0000 Max. :23.0 Max. :367030 NA Max. :885282 NA NA NA Student : 712 Max. :142.00 NA Max. :69740 Max. :25.000 z_SUV :2294 NA Max. :57037 Max. :5.0000 NA Max. :13.000 Max. :28.000 NA
NA NA NA NA’s :6 NA NA’s :454 NA’s :445 NA NA’s :464 NA NA NA (Other) :1413 NA NA NA NA NA NA NA NA NA NA NA’s :510 NA

NA count for each column:

na_count <- sapply(train, function(x) sum(is.na(x))) %>% kable() %>% kable_styling()
# na_count 
ntrain<-select_if(train, is.numeric)
ntrain %>%
  keep(is.numeric) %>%                     # Keep only numeric columns
  gather() %>%                             # Convert to key-value pairs
  ggplot(aes(value)) +                     # Plot the values
    facet_wrap(~ key, scales = "free") +   # In separate panels
    geom_density() 

Imputation of missing (NA) values

Our data exploration revealed that there are multiple variables with missing values. There are several ways to treat this situation: deleting the observations with NA values, deleting the variables that has NA values, imputation with the mean/median/mode or imputation with a prediction. For this scenario we will imputing the missing data with mean.

train$AGE[is.na(train$AGE)] <- mean(train$AGE, na.rm=TRUE)
train$YOJ[is.na(train$YOJ)] <- mean(train$YOJ, na.rm=TRUE)
train$HOME_VAL[is.na(train$HOME_VAL)] <- mean(train$HOME_VAL, na.rm=TRUE)
train$CAR_AGE[is.na(train$CAR_AGE)] <- mean(train$CAR_AGE, na.rm=TRUE)
train$INCOME[is.na(train$INCOME)] <- mean(train$INCOME, na.rm=TRUE)
train <- train[complete.cases(train),]
visdat::vis_miss(train)

Missing data have been fixed.

More data preparation…

# get complete cases
train <- train[complete.cases(train),]

# save the completed clean data into a new dataframe
train_clean <- train



# INCOME, HOME_VAL, BLUEBOOK, and OLDCLAIM are represented as strings. So we will be extracting the numeric values for these.

train_clean = as.tbl(train_clean) %>% 
  mutate_at(c("INCOME","HOME_VAL","BLUEBOOK","OLDCLAIM"),
            currencyconv) %>% 
  mutate_at(c("EDUCATION","JOB","CAR_TYPE","URBANICITY"),
            underscore) %>% 
  mutate_at(c("EDUCATION","JOB","CAR_TYPE","URBANICITY"),
            as.factor) %>%
  mutate(TARGET_FLAG = as.numeric(TARGET_FLAG))



# transform data using log for skewed HOMEKIDS, MVR_PTS, OLDCLAIM, TIF, KIDSDRIVE and CLM_FREQ 
#Convert indicator variables to 0s and 1s; 1 = Yes, Male for Sex, Commercial for Car Use, Red for RED_CAR, and Highly Urban for URBANICITY
train_clean$PARENT1 <- ifelse(train_clean$PARENT1=="Yes", 1, 0)
train_clean$MSTATUS <- ifelse(train_clean$MSTATUS=="Yes", 1, 0)
train_clean$SEX <- ifelse(train_clean$SEX=="M", 1, 0)
train_clean$CAR_USE <- ifelse(train_clean$CAR_USE=="Commercial", 1, 0)
train_clean$RED_CAR <- ifelse(train_clean$RED_CAR=="Yes", 1, 0)
train_clean$REVOKED <- ifelse(train_clean$REVOKED=="Yes", 1, 0)
train_clean$URBANICITY <- ifelse(train_clean$URBANICITY == "Highly Urban/ Urban", 1, 0)
train_clean$TARGET_FLAG <- ifelse(train_clean$TARGET_FLAG == 2, 0, 1)
#Convert categorical predictor values to indicator variables - EDUCATION, CAR_TYPE, JOB
#EDUCATION, High school graduate is base case
train_clean$HSDropout <- ifelse(train_clean$EDUCATION=="<High School", 1, 0)
train_clean$Bachelors <- ifelse(train_clean$EDUCATION=="Bachelors", 1, 0)
train_clean$Masters <- ifelse(train_clean$EDUCATION=="Masters", 1, 0)
train_clean$PhD <- ifelse(train_clean$EDUCATION=="PhD", 1, 0)
#CAR_TYPE, base case is minivan
train_clean$Panel_Truck <- ifelse(train_clean$CAR_TYPE=="Panel Truck", 1, 0)
train_clean$Pickup <- ifelse(train_clean$CAR_TYPE=="Pickup", 1, 0)
train_clean$Sports_Car <- ifelse(train_clean$CAR_TYPE=="Sports Car", 1, 0)
train_clean$Van <- ifelse(train_clean$CAR_TYPE=="Van", 1, 0)
train_clean$SUV <- ifelse(train_clean$CAR_TYPE=="z_SUV", 1, 0)
#JOB, base case is ""
train_clean$Professional <- ifelse(train_clean$JOB == "Professional", 1, 0)
train_clean$Blue_Collar <- ifelse(train_clean$JOB == "Professional", 1, 0)
train_clean$Clerical <- ifelse(train_clean$JOB == "Clerical", 1, 0)
train_clean$Doctor <- ifelse(train_clean$JOB == "Doctor", 1, 0)
train_clean$Lawyer <- ifelse(train_clean$JOB == "Lawyer", 1, 0)
train_clean$Manager <- ifelse(train_clean$JOB == "Manager", 1, 0)
train_clean$Home_Maker <- ifelse(train_clean$JOB == "Home Maker", 1, 0)
train_clean$Student <- ifelse(train_clean$JOB == "Student", 1, 0)

# check for missing data
visdat::vis_miss(train_clean)

# datatype and variable name
str(train_clean)
## tibble [8,161 x 42] (S3: tbl_df/tbl/data.frame)
##  $ TARGET_FLAG : num [1:8161] 1 1 1 1 1 0 1 0 0 1 ...
##  $ TARGET_AMT  : num [1:8161] 0 0 0 0 0 ...
##  $ KIDSDRIV    : int [1:8161] 0 0 0 0 0 0 0 1 0 0 ...
##  $ AGE         : num [1:8161] 60 43 35 51 50 34 54 37 34 50 ...
##  $ HOMEKIDS    : int [1:8161] 0 0 1 0 0 1 0 2 0 0 ...
##  $ YOJ         : num [1:8161] 11 11 10 14 10.5 ...
##  $ INCOME      : num [1:8161] 67349 91449 16039 61898 114986 ...
##  $ PARENT1     : num [1:8161] 0 0 0 0 0 1 0 0 0 0 ...
##  $ HOME_VAL    : num [1:8161] 0 257252 124191 306251 243925 ...
##  $ MSTATUS     : num [1:8161] 0 0 1 1 1 0 1 1 0 0 ...
##  $ SEX         : num [1:8161] 1 1 0 1 0 0 0 1 0 1 ...
##  $ EDUCATION   : Factor w/ 5 levels "<High_School",..: 4 5 5 1 4 2 1 2 2 2 ...
##  $ JOB         : Factor w/ 9 levels "","Clerical",..: 7 9 2 9 3 9 9 9 2 7 ...
##  $ TRAVTIME    : int [1:8161] 14 22 5 32 36 46 33 44 34 48 ...
##  $ CAR_USE     : num [1:8161] 0 1 0 0 0 1 0 1 0 1 ...
##  $ BLUEBOOK    : num [1:8161] 14230 14940 4010 15440 18000 ...
##  $ TIF         : int [1:8161] 11 1 4 7 1 1 1 1 1 7 ...
##  $ CAR_TYPE    : Factor w/ 6 levels "Minivan","Panel_Truck",..: 1 1 6 1 6 4 6 5 6 5 ...
##  $ RED_CAR     : num [1:8161] 0 0 0 0 0 0 0 0 0 0 ...
##  $ OLDCLAIM    : num [1:8161] 4461 0 38690 0 19217 ...
##  $ CLM_FREQ    : int [1:8161] 2 0 2 0 2 0 0 1 0 0 ...
##  $ REVOKED     : num [1:8161] 0 0 0 0 1 0 0 1 0 0 ...
##  $ MVR_PTS     : int [1:8161] 3 0 3 0 3 0 0 10 0 1 ...
##  $ CAR_AGE     : num [1:8161] 18 1 10 6 17 7 1 7 1 17 ...
##  $ URBANICITY  : num [1:8161] 0 0 0 0 0 0 0 0 0 0 ...
##  $ HSDropout   : num [1:8161] 0 0 0 0 0 0 0 0 0 0 ...
##  $ Bachelors   : num [1:8161] 0 0 0 0 0 1 0 1 1 1 ...
##  $ Masters     : num [1:8161] 0 0 0 0 0 0 0 0 0 0 ...
##  $ PhD         : num [1:8161] 1 0 0 0 1 0 0 0 0 0 ...
##  $ Panel_Truck : num [1:8161] 0 0 0 0 0 0 0 0 0 0 ...
##  $ Pickup      : num [1:8161] 0 0 0 0 0 0 0 0 0 0 ...
##  $ Sports_Car  : num [1:8161] 0 0 0 0 0 0 0 0 0 0 ...
##  $ Van         : num [1:8161] 0 0 0 0 0 0 0 1 0 1 ...
##  $ SUV         : num [1:8161] 0 0 1 0 1 0 1 0 1 0 ...
##  $ Professional: num [1:8161] 1 0 0 0 0 0 0 0 0 1 ...
##  $ Blue_Collar : num [1:8161] 1 0 0 0 0 0 0 0 0 1 ...
##  $ Clerical    : num [1:8161] 0 0 1 0 0 0 0 0 1 0 ...
##  $ Doctor      : num [1:8161] 0 0 0 0 1 0 0 0 0 0 ...
##  $ Lawyer      : num [1:8161] 0 0 0 0 0 0 0 0 0 0 ...
##  $ Manager     : num [1:8161] 0 0 0 0 0 0 0 0 0 0 ...
##  $ Home_Maker  : num [1:8161] 0 0 0 0 0 0 0 0 0 0 ...
##  $ Student     : num [1:8161] 0 0 0 0 0 0 0 0 0 0 ...
write.csv(train_clean, file = "train_clean.csv")
getwd()
## [1] "C:/Users/Udaya/Downloads"
trainnum <- dplyr::select_if(train, is.numeric)
rcorr(as.matrix(trainnum))
##            TARGET_AMT KIDSDRIV   AGE HOMEKIDS   YOJ INCOME HOME_VAL TRAVTIME
## TARGET_AMT       1.00     0.06 -0.04     0.06 -0.02  -0.06    -0.08     0.03
## KIDSDRIV         0.06     1.00 -0.08     0.46  0.04  -0.05    -0.02     0.01
## AGE             -0.04    -0.08  1.00    -0.45  0.13   0.18     0.20     0.01
## HOMEKIDS         0.06     0.46 -0.45     1.00  0.08  -0.16    -0.11    -0.01
## YOJ             -0.02     0.04  0.13     0.08  1.00   0.27     0.26    -0.02
## INCOME          -0.06    -0.05  0.18    -0.16  0.27   1.00     0.54    -0.05
## HOME_VAL        -0.08    -0.02  0.20    -0.11  0.26   0.54     1.00    -0.03
## TRAVTIME         0.03     0.01  0.01    -0.01 -0.02  -0.05    -0.03     1.00
## BLUEBOOK         0.00    -0.02  0.16    -0.11  0.14   0.42     0.25    -0.02
## TIF             -0.05     0.00  0.00     0.01  0.02   0.00     0.00    -0.01
## OLDCLAIM         0.07     0.02 -0.03     0.03  0.00  -0.04    -0.07    -0.02
## CLM_FREQ         0.12     0.04 -0.02     0.03 -0.03  -0.05    -0.09     0.01
## MVR_PTS          0.14     0.05 -0.07     0.06 -0.04  -0.06    -0.08     0.01
## CAR_AGE         -0.06    -0.05  0.17    -0.15  0.06   0.39     0.20    -0.04
##            BLUEBOOK   TIF OLDCLAIM CLM_FREQ MVR_PTS CAR_AGE
## TARGET_AMT     0.00 -0.05     0.07     0.12    0.14   -0.06
## KIDSDRIV      -0.02  0.00     0.02     0.04    0.05   -0.05
## AGE            0.16  0.00    -0.03    -0.02   -0.07    0.17
## HOMEKIDS      -0.11  0.01     0.03     0.03    0.06   -0.15
## YOJ            0.14  0.02     0.00    -0.03   -0.04    0.06
## INCOME         0.42  0.00    -0.04    -0.05   -0.06    0.39
## HOME_VAL       0.25  0.00    -0.07    -0.09   -0.08    0.20
## TRAVTIME      -0.02 -0.01    -0.02     0.01    0.01   -0.04
## BLUEBOOK       1.00 -0.01    -0.03    -0.04   -0.04    0.18
## TIF           -0.01  1.00    -0.02    -0.02   -0.04    0.01
## OLDCLAIM      -0.03 -0.02     1.00     0.50    0.26   -0.01
## CLM_FREQ      -0.04 -0.02     0.50     1.00    0.40   -0.01
## MVR_PTS       -0.04 -0.04     0.26     0.40    1.00   -0.02
## CAR_AGE        0.18  0.01    -0.01    -0.01   -0.02    1.00
## 
## n= 8161 
## 
## 
## P
##            TARGET_AMT KIDSDRIV AGE    HOMEKIDS YOJ    INCOME HOME_VAL TRAVTIME
## TARGET_AMT            0.0000   0.0002 0.0000   0.0585 0.0000 0.0000   0.0115  
## KIDSDRIV   0.0000              0.0000 0.0000   0.0002 0.0000 0.0803   0.4455  
## AGE        0.0002     0.0000          0.0000   0.0000 0.0000 0.0000   0.6342  
## HOMEKIDS   0.0000     0.0000   0.0000          0.0000 0.0000 0.0000   0.5128  
## YOJ        0.0585     0.0002   0.0000 0.0000          0.0000 0.0000   0.1362  
## INCOME     0.0000     0.0000   0.0000 0.0000   0.0000        0.0000   0.0000  
## HOME_VAL   0.0000     0.0803   0.0000 0.0000   0.0000 0.0000          0.0018  
## TRAVTIME   0.0115     0.4455   0.6342 0.5128   0.1362 0.0000 0.0018           
## BLUEBOOK   0.6712     0.0516   0.0000 0.0000   0.0000 0.0000 0.0000   0.1246  
## TIF        0.0000     0.8574   0.9952 0.2859   0.0294 0.9274 0.8569   0.2945  
## OLDCLAIM   0.0000     0.0653   0.0082 0.0069   0.7931 0.0000 0.0000   0.0818  
## CLM_FREQ   0.0000     0.0008   0.0296 0.0080   0.0210 0.0000 0.0000   0.5535  
## MVR_PTS    0.0000     0.0000   0.0000 0.0000   0.0009 0.0000 0.0000   0.3384  
## CAR_AGE    0.0000     0.0000   0.0000 0.0000   0.0000 0.0000 0.0000   0.0009  
##            BLUEBOOK TIF    OLDCLAIM CLM_FREQ MVR_PTS CAR_AGE
## TARGET_AMT 0.6712   0.0000 0.0000   0.0000   0.0000  0.0000 
## KIDSDRIV   0.0516   0.8574 0.0653   0.0008   0.0000  0.0000 
## AGE        0.0000   0.9952 0.0082   0.0296   0.0000  0.0000 
## HOMEKIDS   0.0000   0.2859 0.0069   0.0080   0.0000  0.0000 
## YOJ        0.0000   0.0294 0.7931   0.0210   0.0009  0.0000 
## INCOME     0.0000   0.9274 0.0000   0.0000   0.0000  0.0000 
## HOME_VAL   0.0000   0.8569 0.0000   0.0000   0.0000  0.0000 
## TRAVTIME   0.1246   0.2945 0.0818   0.5535   0.3384  0.0009 
## BLUEBOOK            0.6242 0.0077   0.0010   0.0004  0.0000 
## TIF        0.6242          0.0473   0.0375   0.0002  0.4971 
## OLDCLAIM   0.0077   0.0473          0.0000   0.0000  0.2402 
## CLM_FREQ   0.0010   0.0375 0.0000            0.0000  0.4151 
## MVR_PTS    0.0004   0.0002 0.0000   0.0000           0.0816 
## CAR_AGE    0.0000   0.4971 0.2402   0.4151   0.0816
corrplot(cor(trainnum), method="square")

Build Models

Model 1

Out first model consists of all variables.

model1 <- lm(TARGET_FLAG ~ KIDSDRIV + AGE + HOMEKIDS + YOJ + INCOME + PARENT1 + HOME_VAL + MSTATUS + SEX + 
               TRAVTIME + CAR_USE + BLUEBOOK + TIF + RED_CAR + OLDCLAIM + CLM_FREQ + REVOKED + MVR_PTS + CAR_AGE + URBANICITY +
               HSDropout + Bachelors + Masters + PhD + Panel_Truck + Pickup + Sports_Car + Van + SUV + Professional + Blue_Collar +
               Clerical + Doctor + Lawyer + Manager + Home_Maker + Student, data = train_clean)


summary(model1)
## 
## Call:
## lm(formula = TARGET_FLAG ~ KIDSDRIV + AGE + HOMEKIDS + YOJ + 
##     INCOME + PARENT1 + HOME_VAL + MSTATUS + SEX + TRAVTIME + 
##     CAR_USE + BLUEBOOK + TIF + RED_CAR + OLDCLAIM + CLM_FREQ + 
##     REVOKED + MVR_PTS + CAR_AGE + URBANICITY + HSDropout + Bachelors + 
##     Masters + PhD + Panel_Truck + Pickup + Sports_Car + Van + 
##     SUV + Professional + Blue_Collar + Clerical + Doctor + Lawyer + 
##     Manager + Home_Maker + Student, data = train_clean)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.0736 -0.3240  0.1376  0.2780  0.9304 
## 
## Coefficients: (7 not defined because of singularities)
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   7.081e-01  3.566e-02  19.859  < 2e-16 ***
## KIDSDRIV     -5.225e-02  1.004e-02  -5.206 1.98e-07 ***
## AGE           8.117e-05  6.206e-04   0.131 0.895933    
## HOMEKIDS     -6.566e-03  5.786e-03  -1.135 0.256557    
## YOJ           1.537e-03  1.279e-03   1.202 0.229362    
## INCOME        3.099e-07  1.535e-07   2.020 0.043457 *  
## PARENT1      -7.849e-02  1.794e-02  -4.376 1.23e-05 ***
## HOME_VAL      1.588e-07  5.244e-08   3.028 0.002468 ** 
## MSTATUS       6.241e-02  1.285e-02   4.855 1.23e-06 ***
## SEX           2.729e-02  1.130e-02   2.416 0.015725 *  
## TRAVTIME     -9.825e-04  2.828e-04  -3.474 0.000515 ***
## CAR_USE      -1.269e-01  1.225e-02 -10.365  < 2e-16 ***
## BLUEBOOK      3.583e-06  6.439e-07   5.565 2.71e-08 ***
## TIF           7.508e-03  1.081e-03   6.943 4.13e-12 ***
## RED_CAR              NA         NA      NA       NA    
## OLDCLAIM      2.393e-06  6.604e-07   3.623 0.000293 ***
## CLM_FREQ     -5.645e-02  4.791e-03 -11.781  < 2e-16 ***
## REVOKED      -1.833e-01  1.537e-02 -11.925  < 2e-16 ***
## MVR_PTS      -2.613e-02  2.293e-03 -11.394  < 2e-16 ***
## CAR_AGE       4.430e-04  1.133e-03   0.391 0.695673    
## URBANICITY           NA         NA      NA       NA    
## HSDropout            NA         NA      NA       NA    
## Bachelors     6.048e-02  1.343e-02   4.505 6.74e-06 ***
## Masters       3.826e-02  1.994e-02   1.918 0.055084 .  
## PhD           3.269e-02  2.521e-02   1.297 0.194744    
## Panel_Truck          NA         NA      NA       NA    
## Pickup       -2.531e-02  1.344e-02  -1.883 0.059702 .  
## Sports_Car           NA         NA      NA       NA    
## Van          -3.962e-02  1.679e-02  -2.361 0.018269 *  
## SUV          -3.035e-02  1.262e-02  -2.405 0.016194 *  
## Professional -8.448e-04  1.610e-02  -0.052 0.958168    
## Blue_Collar          NA         NA      NA       NA    
## Clerical      1.754e-03  1.517e-02   0.116 0.907960    
## Doctor        3.472e-02  3.350e-02   1.036 0.300058    
## Lawyer       -2.403e-03  2.142e-02  -0.112 0.910661    
## Manager       5.729e-02  1.672e-02   3.427 0.000612 ***
## Home_Maker           NA         NA      NA       NA    
## Student       2.029e-02  1.871e-02   1.084 0.278180    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4037 on 8130 degrees of freedom
## Multiple R-squared:  0.164,  Adjusted R-squared:  0.1609 
## F-statistic: 53.15 on 30 and 8130 DF,  p-value: < 2.2e-16
plot(resid(model1))

hist(resid(model1))

qqnorm(resid(model1))
qqline(resid(model1))

Model 2

Remove those variable, having high p-values.

model2 <- lm(TARGET_FLAG ~  KIDSDRIV + PARENT1 + HOME_VAL + MSTATUS + SEX + TRAVTIME + 
    CAR_USE + BLUEBOOK + TIF + OLDCLAIM + CLM_FREQ + REVOKED + MVR_PTS + Bachelors + 
    Masters + PhD + Manager, data = train_clean)

summary(model2)
## 
## Call:
## lm(formula = TARGET_FLAG ~ KIDSDRIV + PARENT1 + HOME_VAL + MSTATUS + 
##     SEX + TRAVTIME + CAR_USE + BLUEBOOK + TIF + OLDCLAIM + CLM_FREQ + 
##     REVOKED + MVR_PTS + Bachelors + Masters + PhD + Manager, 
##     data = train_clean)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.0631 -0.3249  0.1376  0.2768  0.9441 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  7.126e-01  1.844e-02  38.641  < 2e-16 ***
## KIDSDRIV    -5.663e-02  9.059e-03  -6.251 4.28e-10 ***
## PARENT1     -8.938e-02  1.568e-02  -5.702 1.23e-08 ***
## HOME_VAL     2.092e-07  4.454e-08   4.697 2.68e-06 ***
## MSTATUS      5.382e-02  1.182e-02   4.553 5.36e-06 ***
## SEX          3.554e-02  9.431e-03   3.768 0.000166 ***
## TRAVTIME    -9.712e-04  2.827e-04  -3.436 0.000594 ***
## CAR_USE     -1.307e-01  1.021e-02 -12.793  < 2e-16 ***
## BLUEBOOK     4.241e-06  5.852e-07   7.246 4.69e-13 ***
## TIF          7.414e-03  1.080e-03   6.863 7.22e-12 ***
## OLDCLAIM     2.407e-06  6.599e-07   3.648 0.000266 ***
## CLM_FREQ    -5.648e-02  4.790e-03 -11.792  < 2e-16 ***
## REVOKED     -1.847e-01  1.535e-02 -12.028  < 2e-16 ***
## MVR_PTS     -2.648e-02  2.289e-03 -11.565  < 2e-16 ***
## Bachelors    6.690e-02  1.145e-02   5.842 5.36e-09 ***
## Masters      5.064e-02  1.304e-02   3.883 0.000104 ***
## PhD          6.448e-02  1.798e-02   3.587 0.000336 ***
## Manager      5.881e-02  1.425e-02   4.127 3.72e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4039 on 8143 degrees of freedom
## Multiple R-squared:  0.1618, Adjusted R-squared:  0.1601 
## F-statistic: 92.47 on 17 and 8143 DF,  p-value: < 2.2e-16
plot(resid(model2))

hist(resid(model2))

qqnorm(resid(model2))
qqline(resid(model2))

Model 3

Select some variable which has more imact on car crash. They are INCOME (high income people more carefully drive), MSTATUS(married people drive more safely). We elemintaes thses variables and using those variable where the car crash probability high.

model3 <- lm(TARGET_FLAG ~  KIDSDRIV + SEX + EDUCATION + TRAVTIME + TIF +CAR_TYPE + RED_CAR + OLDCLAIM +
               CLM_FREQ + REVOKED + MVR_PTS, data = train_clean)

summary(model3)
## 
## Call:
## lm(formula = TARGET_FLAG ~ KIDSDRIV + SEX + EDUCATION + TRAVTIME + 
##     TIF + CAR_TYPE + RED_CAR + OLDCLAIM + CLM_FREQ + REVOKED + 
##     MVR_PTS, data = train_clean)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.0417 -0.3630  0.1499  0.2754  0.8706 
## 
## Coefficients: (1 not defined because of singularities)
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             9.097e-01  2.087e-02  43.586  < 2e-16 ***
## KIDSDRIV               -6.524e-02  8.955e-03  -7.285 3.52e-13 ***
## SEX                    -2.839e-02  1.307e-02  -2.172 0.029889 *  
## EDUCATIONBachelors      7.536e-02  1.476e-02   5.104 3.39e-07 ***
## EDUCATIONMasters        9.924e-02  1.574e-02   6.305 3.04e-10 ***
## EDUCATIONPhD            1.247e-01  1.961e-02   6.359 2.14e-10 ***
## EDUCATIONz_High_School -1.916e-02  1.462e-02  -1.311 0.189875    
## TRAVTIME               -1.060e-03  2.869e-04  -3.694 0.000222 ***
## TIF                     7.410e-03  1.100e-03   6.738 1.72e-11 ***
## CAR_TYPEPanel_Truck    -9.300e-02  1.871e-02  -4.970 6.82e-07 ***
## CAR_TYPEPickup         -1.210e-01  1.424e-02  -8.494  < 2e-16 ***
## CAR_TYPESports_Car     -1.546e-01  1.835e-02  -8.422  < 2e-16 ***
## CAR_TYPEVan            -9.296e-02  1.773e-02  -5.244 1.61e-07 ***
## CAR_TYPEz_SUV          -1.192e-01  1.485e-02  -8.027 1.13e-15 ***
## RED_CAR                        NA         NA      NA       NA    
## OLDCLAIM                2.599e-06  6.721e-07   3.867 0.000111 ***
## CLM_FREQ               -6.142e-02  4.867e-03 -12.621  < 2e-16 ***
## REVOKED                -1.972e-01  1.562e-02 -12.630  < 2e-16 ***
## MVR_PTS                -2.924e-02  2.324e-03 -12.584  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4112 on 8143 degrees of freedom
## Multiple R-squared:  0.1314, Adjusted R-squared:  0.1296 
## F-statistic: 72.49 on 17 and 8143 DF,  p-value: < 2.2e-16

Binary Logistic Regression

Model 4

All of the variables will be tested to determine the base model they provided. This will allow us to see which variables are significant in our dataset, and allow us to make other models based on that.

model4 <- glm(TARGET_FLAG ~ KIDSDRIV + AGE + HOMEKIDS + YOJ + INCOME + PARENT1 + HOME_VAL + MSTATUS + SEX + 
               TRAVTIME + CAR_USE + BLUEBOOK + TIF + RED_CAR + OLDCLAIM + CLM_FREQ + REVOKED + MVR_PTS + CAR_AGE + URBANICITY +
               HSDropout + Bachelors + Masters + PhD + Panel_Truck + Pickup + Sports_Car + Van + SUV + Professional + Blue_Collar +
               Clerical + Doctor + Lawyer + Manager + Home_Maker + Student, data = train_clean, family = 'binomial')

summary(model4)
## 
## Call:
## glm(formula = TARGET_FLAG ~ KIDSDRIV + AGE + HOMEKIDS + YOJ + 
##     INCOME + PARENT1 + HOME_VAL + MSTATUS + SEX + TRAVTIME + 
##     CAR_USE + BLUEBOOK + TIF + RED_CAR + OLDCLAIM + CLM_FREQ + 
##     REVOKED + MVR_PTS + CAR_AGE + URBANICITY + HSDropout + Bachelors + 
##     Masters + PhD + Panel_Truck + Pickup + Sports_Car + Van + 
##     SUV + Professional + Blue_Collar + Clerical + Doctor + Lawyer + 
##     Manager + Home_Maker + Student, family = "binomial", data = train_clean)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.6573  -0.7660   0.5133   0.7500   2.2621  
## 
## Coefficients: (7 not defined because of singularities)
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   8.629e-01  2.177e-01   3.964 7.38e-05 ***
## KIDSDRIV     -2.882e-01  5.700e-02  -5.057 4.26e-07 ***
## AGE           1.451e-03  3.803e-03   0.382 0.702749    
## HOMEKIDS     -4.245e-02  3.499e-02  -1.213 0.225000    
## YOJ           5.773e-03  7.661e-03   0.753 0.451150    
## INCOME        2.704e-06  9.897e-07   2.732 0.006291 ** 
## PARENT1      -3.252e-01  1.033e-01  -3.147 0.001651 ** 
## HOME_VAL      1.260e-06  3.305e-07   3.812 0.000138 ***
## MSTATUS       3.877e-01  7.954e-02   4.874 1.09e-06 ***
## SEX           1.578e-01  7.182e-02   2.197 0.028004 *  
## TRAVTIME     -6.523e-03  1.723e-03  -3.786 0.000153 ***
## CAR_USE      -7.501e-01  7.413e-02 -10.119  < 2e-16 ***
## BLUEBOOK      2.194e-05  4.102e-06   5.349 8.83e-08 ***
## TIF           4.968e-02  7.012e-03   7.084 1.40e-12 ***
## RED_CAR              NA         NA      NA       NA    
## OLDCLAIM      1.326e-05  3.774e-06   3.515 0.000440 ***
## CLM_FREQ     -3.191e-01  2.716e-02 -11.751  < 2e-16 ***
## REVOKED      -9.814e-01  8.708e-02 -11.271  < 2e-16 ***
## MVR_PTS      -1.368e-01  1.301e-02 -10.510  < 2e-16 ***
## CAR_AGE       2.382e-03  7.162e-03   0.333 0.739404    
## URBANICITY           NA         NA      NA       NA    
## HSDropout            NA         NA      NA       NA    
## Bachelors     3.323e-01  8.170e-02   4.067 4.76e-05 ***
## Masters       1.864e-01  1.260e-01   1.479 0.139186    
## PhD           1.318e-01  1.622e-01   0.812 0.416518    
## Panel_Truck          NA         NA      NA       NA    
## Pickup       -1.487e-01  8.198e-02  -1.814 0.069641 .  
## Sports_Car           NA         NA      NA       NA    
## Van          -2.643e-01  1.045e-01  -2.529 0.011440 *  
## SUV          -1.983e-01  7.789e-02  -2.546 0.010881 *  
## Professional -7.901e-03  1.002e-01  -0.079 0.937174    
## Blue_Collar          NA         NA      NA       NA    
## Clerical      9.555e-03  8.924e-02   0.107 0.914733    
## Doctor        4.592e-01  2.534e-01   1.812 0.069967 .  
## Lawyer        9.155e-03  1.398e-01   0.065 0.947784    
## Manager       5.019e-01  1.163e-01   4.316 1.59e-05 ***
## Home_Maker           NA         NA      NA       NA    
## Student       2.095e-01  1.077e-01   1.946 0.051693 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 9418.0  on 8160  degrees of freedom
## Residual deviance: 8014.6  on 8130  degrees of freedom
## AIC: 8076.6
## 
## Number of Fisher Scoring iterations: 5
par(mfrow = c(2,2))
plot(model4)

Model 5

Variables will be removed one by one to determine best fit model. After each variable is removed, the model will be ‘ran’ again - until the most optimal output are produced.

model5 <- glm(TARGET_FLAG ~ KIDSDRIV + INCOME + PARENT1 + HOME_VAL + MSTATUS + SEX + 
               TRAVTIME + CAR_USE + BLUEBOOK + TIF + OLDCLAIM + CLM_FREQ + REVOKED + MVR_PTS +Bachelors + 
                Masters + PhD + Van + SUV + Professional +
               Clerical + Doctor + Lawyer + Manager  + Student, data = train_clean, family = 'binomial')

summary(model5)
## 
## Call:
## glm(formula = TARGET_FLAG ~ KIDSDRIV + INCOME + PARENT1 + HOME_VAL + 
##     MSTATUS + SEX + TRAVTIME + CAR_USE + BLUEBOOK + TIF + OLDCLAIM + 
##     CLM_FREQ + REVOKED + MVR_PTS + Bachelors + Masters + PhD + 
##     Van + SUV + Professional + Clerical + Doctor + Lawyer + Manager + 
##     Student, family = "binomial", data = train_clean)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.6524  -0.7674   0.5130   0.7503   2.2616  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   9.222e-01  1.322e-01   6.973 3.10e-12 ***
## KIDSDRIV     -3.167e-01  5.130e-02  -6.173 6.68e-10 ***
## INCOME        2.851e-06  9.674e-07   2.947 0.003211 ** 
## PARENT1      -3.998e-01  8.862e-02  -4.511 6.45e-06 ***
## HOME_VAL      1.299e-06  3.294e-07   3.945 7.99e-05 ***
## MSTATUS       3.612e-01  7.553e-02   4.782 1.74e-06 ***
## SEX           1.567e-01  7.126e-02   2.200 0.027830 *  
## TRAVTIME     -6.392e-03  1.721e-03  -3.714 0.000204 ***
## CAR_USE      -7.798e-01  7.150e-02 -10.907  < 2e-16 ***
## BLUEBOOK      2.398e-05  3.986e-06   6.015 1.80e-09 ***
## TIF           4.961e-02  7.010e-03   7.077 1.47e-12 ***
## OLDCLAIM      1.345e-05  3.769e-06   3.567 0.000361 ***
## CLM_FREQ     -3.183e-01  2.714e-02 -11.729  < 2e-16 ***
## REVOKED      -9.873e-01  8.696e-02 -11.354  < 2e-16 ***
## MVR_PTS      -1.375e-01  1.299e-02 -10.584  < 2e-16 ***
## Bachelors     3.450e-01  7.540e-02   4.575 4.77e-06 ***
## Masters       2.064e-01  1.070e-01   1.929 0.053751 .  
## PhD           1.441e-01  1.477e-01   0.975 0.329422    
## Van          -2.224e-01  1.010e-01  -2.201 0.027726 *  
## SUV          -1.593e-01  7.489e-02  -2.127 0.033438 *  
## Professional -8.608e-03  9.945e-02  -0.087 0.931021    
## Clerical     -3.901e-03  8.760e-02  -0.045 0.964481    
## Doctor        4.731e-01  2.531e-01   1.869 0.061575 .  
## Lawyer        2.196e-02  1.388e-01   0.158 0.874286    
## Manager       5.029e-01  1.154e-01   4.357 1.32e-05 ***
## Student       1.812e-01  1.050e-01   1.725 0.084492 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 9418.0  on 8160  degrees of freedom
## Residual deviance: 8020.7  on 8135  degrees of freedom
## AIC: 8072.7
## 
## Number of Fisher Scoring iterations: 5
par(mfrow = c(2,2))
plot(model5)

Model 6

dropterm from MASS package automatically test all models that differ from the current model by the dropping of one single term. This is done respecting marginality, so it doesn’t try models in which one main effect is dopped if the same predictor is also present in any interaction.

dropterm(model4, test = "F")
## Single term deletions
## 
## Model:
## TARGET_FLAG ~ KIDSDRIV + AGE + HOMEKIDS + YOJ + INCOME + PARENT1 + 
##     HOME_VAL + MSTATUS + SEX + TRAVTIME + CAR_USE + BLUEBOOK + 
##     TIF + RED_CAR + OLDCLAIM + CLM_FREQ + REVOKED + MVR_PTS + 
##     CAR_AGE + URBANICITY + HSDropout + Bachelors + Masters + 
##     PhD + Panel_Truck + Pickup + Sports_Car + Van + SUV + Professional + 
##     Blue_Collar + Clerical + Doctor + Lawyer + Manager + Home_Maker + 
##     Student
##              Df Deviance    AIC  F value     Pr(F)    
## <none>            8014.6 8076.6                       
## KIDSDRIV      1   8040.1 8100.1  25.8373 3.797e-07 ***
## AGE           1   8014.8 8074.8   0.1477 0.7007058    
## HOMEKIDS      1   8016.1 8076.1   1.4863 0.2228271    
## YOJ           1   8015.2 8075.2   0.5753 0.4481925    
## INCOME        1   8022.2 8082.2   7.6554 0.0056730 ** 
## PARENT1       1   8024.6 8084.6  10.0521 0.0015274 ** 
## HOME_VAL      1   8029.2 8089.2  14.8114 0.0001197 ***
## MSTATUS       1   8038.1 8098.1  23.8247 1.075e-06 ***
## SEX           1   8019.5 8079.5   4.8912 0.0270212 *  
## TRAVTIME      1   8028.9 8088.9  14.5012 0.0001411 ***
## CAR_USE       1   8118.9 8178.9 105.7263 < 2.2e-16 ***
## BLUEBOOK      1   8043.7 8103.7  29.4927 5.775e-08 ***
## TIF           1   8066.6 8126.6  52.6722 4.305e-13 ***
## RED_CAR       0   8014.6 8076.6                       
## OLDCLAIM      1   8027.3 8087.3  12.7865 0.0003511 ***
## CLM_FREQ      1   8150.2 8210.2 137.4748 < 2.2e-16 ***
## REVOKED       1   8138.3 8198.3 125.4367 < 2.2e-16 ***
## MVR_PTS       1   8125.9 8185.9 112.8907 < 2.2e-16 ***
## CAR_AGE       1   8014.8 8074.8   0.1122 0.7376569    
## URBANICITY    0   8014.6 8076.6                       
## HSDropout     0   8014.6 8076.6                       
## Bachelors     1   8031.3 8091.3  16.9165 3.945e-05 ***
## Masters       1   8016.8 8076.8   2.2310 0.1353092    
## PhD           1   8015.3 8075.3   0.6744 0.4115384    
## Panel_Truck   0   8014.6 8076.6                       
## Pickup        1   8017.9 8077.9   3.3184 0.0685432 .  
## Sports_Car    0   8014.6 8076.6                       
## Van           1   8020.9 8080.9   6.3871 0.0115144 *  
## SUV           1   8021.1 8081.1   6.5899 0.0102736 *  
## Professional  0   8014.6 8076.6                       
## Blue_Collar   0   8014.6 8076.6                       
## Clerical      1   8014.7 8074.7   0.0116 0.9141175    
## Doctor        1   8018.1 8078.1   3.4662 0.0626718 .  
## Lawyer        1   8014.7 8074.7   0.0044 0.9474070    
## Manager       1   8034.1 8094.1  19.7215 9.077e-06 ***
## Home_Maker    0   8014.6 8076.6                       
## Student       1   8018.5 8078.5   3.8630 0.0493949 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
model6 <- glm(TARGET_FLAG ~ KIDSDRIV + INCOME + PARENT1 + HOME_VAL + MSTATUS + 
               TRAVTIME + CAR_USE + BLUEBOOK + TIF + OLDCLAIM + CLM_FREQ + REVOKED + MVR_PTS + Bachelors + 
                Van + SUV +  Manager, data = train_clean, family=binomial(link="logit"))

summary(model6)
## 
## Call:
## glm(formula = TARGET_FLAG ~ KIDSDRIV + INCOME + PARENT1 + HOME_VAL + 
##     MSTATUS + TRAVTIME + CAR_USE + BLUEBOOK + TIF + OLDCLAIM + 
##     CLM_FREQ + REVOKED + MVR_PTS + Bachelors + Van + SUV + Manager, 
##     family = binomial(link = "logit"), data = train_clean)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.6453  -0.7687   0.5169   0.7489   2.2721  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  1.107e+00  1.151e-01   9.623  < 2e-16 ***
## KIDSDRIV    -3.234e-01  5.122e-02  -6.312 2.75e-10 ***
## INCOME       3.901e-06  8.359e-07   4.668 3.05e-06 ***
## PARENT1     -4.304e-01  8.800e-02  -4.891 1.00e-06 ***
## HOME_VAL     1.188e-06  3.175e-07   3.741 0.000183 ***
## MSTATUS      3.540e-01  7.437e-02   4.760 1.94e-06 ***
## TRAVTIME    -6.536e-03  1.717e-03  -3.806 0.000141 ***
## CAR_USE     -7.874e-01  5.967e-02 -13.196  < 2e-16 ***
## BLUEBOOK     2.272e-05  3.893e-06   5.836 5.33e-09 ***
## TIF          4.949e-02  6.995e-03   7.075 1.50e-12 ***
## OLDCLAIM     1.307e-05  3.755e-06   3.481 0.000500 ***
## CLM_FREQ    -3.138e-01  2.702e-02 -11.613  < 2e-16 ***
## REVOKED     -9.803e-01  8.665e-02 -11.314  < 2e-16 ***
## MVR_PTS     -1.376e-01  1.296e-02 -10.613  < 2e-16 ***
## Bachelors    2.536e-01  6.437e-02   3.939 8.17e-05 ***
## Van         -1.788e-01  9.883e-02  -1.810 0.070354 .  
## SUV         -2.494e-01  6.408e-02  -3.892 9.94e-05 ***
## Manager      5.031e-01  1.053e-01   4.779 1.76e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 9418.0  on 8160  degrees of freedom
## Residual deviance: 8041.2  on 8143  degrees of freedom
## AIC: 8077.2
## 
## Number of Fisher Scoring iterations: 4
par(mfrow = c(2,2))
plot(model6)

Display AIC and ROC of Binary Logistic Regression models

AIC <- cbind(model4$aic, model5$aic, model6$aic)
colnames(AIC) <- c("Model 4", "Model 5", "Model 6")
print(AIC)
##       Model 4 Model 5  Model 6
## [1,] 8076.646 8072.67 8077.152
pred_4 <- predict(model4,train_clean)
pred_5 <- predict(model5,train_clean)
pred_6 <- predict(model6,train_clean)

plot(roc(train_clean$TARGET_FLAG, pred_4, direction="<"),col="blue", lwd=3, main="ROC Curve")

plot(roc(train_clean$TARGET_FLAG, pred_5, direction="<"),col="blue", lwd=3, main="ROC Curve")

plot(roc(train_clean$TARGET_FLAG, pred_6, direction="<"),col="blue", lwd=3, main="ROC Curve")

Model Selection

To make prediction, we will compare various metrics for all three models. We calculate all three models’ accuracy, classification error rate, precision, sensitivity, specificity, F1 score, AUC, and confusion matrix. Even though all models yield similar metrics value, model 5 has the low AIC value. We will pick model 5 for our prediction.

Model 4 matrices:

predictedval <- predict(model4,train_clean)

cm1 <- table(true = train_clean$TARGET_FLAG, pred = round(fitted(model4)))

TN <- cm1[4]
FN <- cm1[3]
TP <- cm1[1]
FP <- cm1[2]

accuracy <- (TP + TN)/(TN + FN + TP + FP)

precision <- TP/(TP + FP)

sensitivity <- TP/(TP + FN)

specificity <- TN/(TN + FP)
  
f1_score <- 2*TP/(2*TP + FP + FN)

roc_obj <- roc(train_clean$TARGET_FLAG, predictedval)


auc <- auc(roc_obj)

df <- c(accuracy, precision, sensitivity, specificity, f1_score, auc)
names(df) <- c("Accuracy", " precision", "F1-sensitivity", 
                      "specificity", "f1_score", "AUC")

kable(df, col.names = "Values") %>%kable_paper('hover', full_width = F)
Values
Accuracy 0.7676755
precision 0.6281157
F1-sensitivity 0.2926150
specificity 0.9379161
f1_score 0.3992395
AUC 0.7582871

Model 5 matrices:

pred_5 <- predict(model5,train_clean)

cm2 <- table(true = train_clean$TARGET_FLAG, pred = round(fitted(model5)))

TN <- cm2[4]
FN <- cm2[3]
TP <- cm2[1]
FP <- cm2[2]

accuracy <- (TP + TN)/(TN + FN + TP + FP)

precision <- TP/(TP + FP)

sensitivity <- TP/(TP + FN)

specificity <- TN/(TN + FP)
  
f1_score <- 2*TP/(2*TP + FP + FN)

roc_obj <- roc(train_clean$TARGET_FLAG, predictedval)

auc <- auc(roc_obj)

df2 <- c(accuracy, precision, sensitivity, specificity, f1_score, auc)
names(df2) <- c("Accuracy", " precision", "F1-sensitivity", 
                      "specificity", "f1_score", "AUC")

kable(df2, col.names = "Values") %>%kable_paper('hover', full_width = F)
Values
Accuracy 0.7671854
precision 0.6266266
F1-sensitivity 0.2907571
specificity 0.9379161
f1_score 0.3972081
AUC 0.7582871

Model 6 matrices:

pred_6 <- predict(model6,train_clean)

cm3 <- table(true = train_clean$TARGET_FLAG, pred = round(fitted(model6)))

TN <- cm3[4]
FN <- cm3[3]
TP <- cm3[1]
FP <- cm3[2]

accuracy <- (TP + TN)/(TN + FN + TP + FP)

precision <- TP/(TP + FP)

sensitivity <- TP/(TP + FN)

specificity <- TN/(TN + FP)
  
f1_score <- 2*TP/(2*TP + FP + FN)

roc_obj <- roc(train_clean$TARGET_FLAG, predictedval)

auc <- auc(roc_obj)

df3 <- c(accuracy, precision, sensitivity, specificity, f1_score, auc)
names(df3) <- c("Accuracy", " precision", "F1-sensitivity", 
                      "specificity", "f1_score", "AUC")

kable(df3, col.names = "Values") %>%kable_paper('hover', full_width = F)
Values
Accuracy 0.7660826
precision 0.6217565
F1-sensitivity 0.2893637
specificity 0.9369174
f1_score 0.3949287
AUC 0.7582871

Combine all 3 data frame and below table display all matrices.

compar_tbl <- cbind(df, df2, df3)
colnames(compar_tbl) <- c("Model 4", "Model 5", "Model 6")
DT::datatable(compar_tbl)

Load test dataset to apply model and predict the car crash. Perform data cleaning and varaibles creation.

test =
  read.csv("https://raw.githubusercontent.com/mharrisonbaker/DATA621_GroupWork2/main/HW4/insurance-evaluation-data.csv",
           header = TRUE) %>% dplyr::select(-INDEX)

dim(test)
## [1] 2141   25

Apply Model 5 on test data to make the prediction

TARGET_FLAG <- predict(model5, newdata = test_clean)
# Classifying the evaluate dataset into claim = 0 or 1
y_pred_num <- ifelse(TARGET_FLAG > 0.5, 1, 0)
y_pred <- factor(y_pred_num, levels=c(0, 1))
summary(y_pred)
##    0    1 
##  482 1659

This model predicts that 1659 insurance customers would have an auto accident, while 482 will not.