Overview

In this homework assignment, you will explore, analyze, and model a data set containing approximately 8,000 records, each representing a customer of an auto insurance company. Each record has two response variables. The first, TARGET_FLAG, is a 1 or a 0: a "1" means the person was in a car crash, and a "0" means they were not. The second, TARGET_AMT, is zero if the person did not crash their car; if they did crash, it is a value greater than zero. Your objective is to build multiple linear regression and binary logistic regression models on the training data to predict the probability that a person will crash their car, and the amount of money it will cost if they do. You may use only the variables given to you (or variables that you derive from them). Below is a short description of the variables of interest in the data set:

# install.packages('pander')  # run once if pander is not already installed

library(readr)
library(kableExtra)
library(tidyverse)
library(knitr)
library(psych)
library(gridExtra)
library(usdm)
library(mice)
library(ggiraph)
library(cowplot)
library(reshape2)
library(corrgram)
library(caTools)
library(caret)
library(ROCR)
library(pROC)
library(Amelia)
library(qqplotr)
library(moments)
library(car)
library(MASS)
library(geoR)
library(pander)
# Note: several of these packages mask each other's functions (e.g. MASS and
# raster mask dplyr::select, and car masks usdm::vif), so namespaced calls such
# as dplyr::select() are used below where it matters.

DATA EXPLORATION:

The dataset of interest contains information about customers of an auto insurance company. The dataset has 8161 rows (each representing a customer) and 25 variables. There are 23 predictor variables and 2 response variables: TARGET_FLAG, a binary categorical variable representing whether each customer has been in an accident; and TARGET_AMT, a numerical variable indicating the cost of a crash that a customer was in.

ins_train <- read.csv("https://raw.githubusercontent.com/Riteshlohiya/Data621-Assignment-4/master/insurance_training_data.csv") 
summary(ins_train)
##      INDEX        TARGET_FLAG       TARGET_AMT        KIDSDRIV     
##  Min.   :    1   Min.   :0.0000   Min.   :     0   Min.   :0.0000  
##  1st Qu.: 2559   1st Qu.:0.0000   1st Qu.:     0   1st Qu.:0.0000  
##  Median : 5133   Median :0.0000   Median :     0   Median :0.0000  
##  Mean   : 5152   Mean   :0.2638   Mean   :  1504   Mean   :0.1711  
##  3rd Qu.: 7745   3rd Qu.:1.0000   3rd Qu.:  1036   3rd Qu.:0.0000  
##  Max.   :10302   Max.   :1.0000   Max.   :107586   Max.   :4.0000  
##                                                                    
##       AGE           HOMEKIDS           YOJ            INCOME    
##  Min.   :16.00   Min.   :0.0000   Min.   : 0.0   $0      : 615  
##  1st Qu.:39.00   1st Qu.:0.0000   1st Qu.: 9.0           : 445  
##  Median :45.00   Median :0.0000   Median :11.0   $26,840 :   4  
##  Mean   :44.79   Mean   :0.7212   Mean   :10.5   $48,509 :   4  
##  3rd Qu.:51.00   3rd Qu.:1.0000   3rd Qu.:13.0   $61,790 :   4  
##  Max.   :81.00   Max.   :5.0000   Max.   :23.0   $107,375:   3  
##  NA's   :6                        NA's   :454    (Other) :7086  
##  PARENT1        HOME_VAL    MSTATUS      SEX               EDUCATION   
##  No :7084   $0      :2294   Yes :4894   M  :3786   <High School :1203  
##  Yes:1077           : 464   z_No:3267   z_F:4375   Bachelors    :2242  
##             $111,129:   3                          Masters      :1658  
##             $115,249:   3                          PhD          : 728  
##             $123,109:   3                          z_High School:2330  
##             $153,061:   3                                              
##             (Other) :5391                                              
##             JOB          TRAVTIME            CAR_USE        BLUEBOOK   
##  z_Blue Collar:1825   Min.   :  5.00   Commercial:3029   $1,500 : 157  
##  Clerical     :1271   1st Qu.: 22.00   Private   :5132   $6,000 :  34  
##  Professional :1117   Median : 33.00                     $5,800 :  33  
##  Manager      : 988   Mean   : 33.49                     $6,200 :  33  
##  Lawyer       : 835   3rd Qu.: 44.00                     $6,400 :  31  
##  Student      : 712   Max.   :142.00                     $5,900 :  30  
##  (Other)      :1413                                      (Other):7843  
##       TIF                CAR_TYPE    RED_CAR       OLDCLAIM   
##  Min.   : 1.000   Minivan    :2145   no :5783   $0     :5009  
##  1st Qu.: 1.000   Panel Truck: 676   yes:2378   $1,310 :   4  
##  Median : 4.000   Pickup     :1389              $1,391 :   4  
##  Mean   : 5.351   Sports Car : 907              $4,263 :   4  
##  3rd Qu.: 7.000   Van        : 750              $1,105 :   3  
##  Max.   :25.000   z_SUV      :2294              $1,332 :   3  
##                                                 (Other):3134  
##     CLM_FREQ      REVOKED       MVR_PTS          CAR_AGE      
##  Min.   :0.0000   No :7161   Min.   : 0.000   Min.   :-3.000  
##  1st Qu.:0.0000   Yes:1000   1st Qu.: 0.000   1st Qu.: 1.000  
##  Median :0.0000              Median : 1.000   Median : 8.000  
##  Mean   :0.7986              Mean   : 1.696   Mean   : 8.328  
##  3rd Qu.:2.0000              3rd Qu.: 3.000   3rd Qu.:12.000  
##  Max.   :5.0000              Max.   :13.000   Max.   :28.000  
##                                               NA's   :510     
##                  URBANICITY  
##  Highly Urban/ Urban  :6492  
##  z_Highly Rural/ Rural:1669  
##                              
##                              
##                              
##                              
## 
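# Tabulate each variable's class and, for factors, the number of levels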
var_class <- data.frame(Class = rep(NA, ncol(ins_train) - 1), Levels = rep(NA, ncol(ins_train) - 1), stringsAsFactors = FALSE, check.names = FALSE, row.names = names(ins_train)[-1])
for(i in 2:ncol(ins_train)) {
  var_class[i - 1, 1] <- class(ins_train[, i])
  var_class[i - 1, 2] <- ifelse(length(levels(ins_train[, i])) == 0, '-', length(levels(ins_train[, i])))
}
pander(var_class)
Variable      Class    Levels
-----------   -------  ------
TARGET_FLAG   integer  -
TARGET_AMT    numeric  -
KIDSDRIV      integer  -
AGE           integer  -
HOMEKIDS      integer  -
YOJ           integer  -
INCOME        factor   6613
PARENT1       factor   2
HOME_VAL      factor   5107
MSTATUS       factor   2
SEX           factor   2
EDUCATION     factor   5
JOB           factor   9
TRAVTIME      integer  -
CAR_USE       factor   2
BLUEBOOK      factor   2789
TIF           integer  -
CAR_TYPE      factor   6
RED_CAR       factor   2
OLDCLAIM      factor   2857
CLM_FREQ      integer  -
REVOKED       factor   2
MVR_PTS       integer  -
CAR_AGE       integer  -
URBANICITY    factor   2

INCOME, HOME_VAL, BLUEBOOK, and OLDCLAIM are stored as strings with dollar signs and commas, so we extract the numeric values from them.

ins_train$INCOME <- as.numeric(str_replace_all(ins_train$INCOME, "[[:punct:]\\$]",""))
ins_train$HOME_VAL <- as.numeric(str_replace_all(ins_train$HOME_VAL, "[[:punct:]\\$]",""))
ins_train$BLUEBOOK <- as.numeric(str_replace_all(ins_train$BLUEBOOK, "[[:punct:]\\$]",""))
ins_train$OLDCLAIM <- as.numeric(str_replace_all(ins_train$OLDCLAIM, "[[:punct:]\\$]",""))
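
As a quick sanity check (a minimal sketch), we can confirm the cleaned columns are now numeric and count the NAs created from the blank entries:

sapply(ins_train[c("INCOME", "HOME_VAL", "BLUEBOOK", "OLDCLAIM")], class)
colSums(is.na(ins_train[c("INCOME", "HOME_VAL", "BLUEBOOK", "OLDCLAIM")]))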

Visual Exploration:

Boxplots are generated for non-binary variables split by TARGET_FLAG:

numeric <- ins_train %>% dplyr::select(c(TARGET_FLAG, TARGET_AMT, KIDSDRIV, AGE, HOMEKIDS, YOJ, INCOME, HOME_VAL, TRAVTIME, BLUEBOOK, TIF, OLDCLAIM, CLM_FREQ, MVR_PTS, CAR_AGE))

numeric <- melt(numeric, id.vars="TARGET_FLAG")
numeric$TARGET_FLAG <- factor(numeric$TARGET_FLAG)
ggplot(numeric, aes(TARGET_FLAG, value)) + geom_boxplot(aes(fill = TARGET_FLAG), alpha = 0.5) + facet_wrap(~variable, scale="free") + scale_fill_discrete(guide = FALSE) + scale_y_continuous('', labels = NULL, breaks = NULL) + scale_x_discrete('') + ggtitle("Distribution of Predictors by TARGET_FLAG\n")
## Warning: Removed 1879 rows containing non-finite values (stat_boxplot).

Now let's look at the correlations:

pairs(~MVR_PTS+CLM_FREQ+URBANICITY+HOME_VAL+PARENT1+CAR_USE+OLDCLAIM, data=ins_train, main="Predictors with High Correlations to Targets", col="slategrey")

Next we examine the missing values in the dataset, using the missingness map from the Amelia package. We can see there are missing values for CAR_AGE, HOME_VAL, YOJ, and INCOME; these need to be taken care of during data preparation.

missmap(ins_train, main = "Missing values vs observed")

Now let's make some plots to understand the data:

AGE - Age of Driver. Very young drivers tend to be risky; perhaps very old drivers as well. We note six missing values that we'll need to address later. The distribution of AGE is almost perfectly normal, and when we break out the data by TARGET_FLAG, the distributions of AGE remain roughly normal.

with(ins_train, c(summary(AGE), SD=sd(AGE), Skew=skewness(AGE), Kurt=kurtosis(AGE)))
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's       SD 
## 16.00000 39.00000 45.00000 44.79031 51.00000 81.00000  6.00000       NA 
##     Skew     Kurt 
##       NA       NA
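
Because AGE contains missing values, sd(), skewness(), and kurtosis() return NA above. A minimal sketch with na.rm = TRUE recovers these statistics on the observed values:

with(ins_train, c(SD = sd(AGE, na.rm = TRUE), Skew = skewness(AGE, na.rm = TRUE), Kurt = kurtosis(AGE, na.rm = TRUE)))
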
hist <- ggplot(ins_train, aes(AGE)) + geom_histogram(fill = 'dodgerblue', binwidth = 10, color = 'darkgray' ) + 
 theme_classic() + labs(title = 'Histogram of AGE') + theme(plot.title = element_text(hjust = 0.5)) 

qq_plot <- ggplot(ins_train, aes(sample=AGE)) + stat_qq_point(color='dodgerblue') + stat_qq_line(color='darkgray') +
  labs(x="Thoretical Quantiles", y="Sample Quantiles", title = "QQ Plot of AGE") + theme_classic() + 
  theme(plot.title = element_text(hjust = 0.5)) 

box_plot <- ggplot(ins_train, aes(x="", AGE)) + geom_boxplot(fill='dodgerblue', color='darkgray')+ theme_classic() +
  labs(title = 'Boxplot of AGE', x="") + theme(plot.title = element_text(hjust = 0.5)) + coord_flip()

box_target <- ggplot(ins_train, aes(x=factor(TARGET_FLAG), AGE)) + geom_boxplot(fill='dodgerblue', color='darkgrey') +
  labs(x='target', title = 'Boxplot of AGE by TARGET_FLAG') + theme_classic() +
  theme(plot.title = element_text(hjust = 0.5)) 

grid.arrange(hist, qq_plot, box_plot, box_target, ncol=2)
## Warning: Removed 6 rows containing non-finite values (stat_bin).
## Warning: Removed 6 rows containing non-finite values (stat_boxplot).

## Warning: Removed 6 rows containing non-finite values (stat_boxplot).

BLUEBOOK - Value of Vehicle. Unknown effect on the probability of collision, but it probably affects the payout if there is a crash. Individuals involved in crashes have a higher proportion of low BLUEBOOK values.

with(ins_train, c(summary(BLUEBOOK), SD=sd(BLUEBOOK), Skew=skewness(BLUEBOOK), Kurt=kurtosis(BLUEBOOK)))
##         Min.      1st Qu.       Median         Mean      3rd Qu. 
## 1.500000e+03 9.280000e+03 1.444000e+04 1.570990e+04 2.085000e+04 
##         Max.           SD         Skew         Kurt 
## 6.974000e+04 8.419734e+03 7.943601e-01 3.792285e+00
hist <- ggplot(ins_train, aes(BLUEBOOK)) + geom_histogram(fill = 'dodgerblue', binwidth = 10000, color = 'darkgray' ) + 
 theme_classic() + labs(title = 'Histogram of BLUEBOOK') + theme(plot.title = element_text(hjust = 0.5)) 

qq_plot <- ggplot(ins_train, aes(sample=BLUEBOOK)) + stat_qq_point(color='dodgerblue') + stat_qq_line(color='darkgray') +
  labs(x="Thoretical Quantiles", y="Sample Quantiles", title = "QQ Plot of BLUEBOOK") + theme_classic() + 
  theme(plot.title = element_text(hjust = 0.5)) 

box_plot <- ggplot(ins_train, aes(x="", BLUEBOOK)) + geom_boxplot(fill='dodgerblue', color='darkgray')+ theme_classic() +
  labs(title = 'Boxplot of BLUEBOOK', x="") + theme(plot.title = element_text(hjust = 0.5)) + coord_flip()

box_target <- ggplot(ins_train, aes(x=factor(TARGET_FLAG), BLUEBOOK)) + geom_boxplot(fill='dodgerblue', color='darkgrey') +
  labs(x='target', title = 'Boxplot of BLUEBOOK by TARGET_FLAG') + theme_classic() +
  theme(plot.title = element_text(hjust = 0.5)) 

grid.arrange(hist, qq_plot, box_plot, box_target, ncol=2)

CAR_AGE - Vehicle Age. The summary shows a negative value (a minimum of -3) for CAR_AGE, along with 510 missing values. We will treat these in the data preparation step.

with(ins_train, c(summary(CAR_AGE), SD=sd(CAR_AGE), Skew=skewness(CAR_AGE), Kurt=kurtosis(CAR_AGE)))
##       Min.    1st Qu.     Median       Mean    3rd Qu.       Max. 
##  -3.000000   1.000000   8.000000   8.328323  12.000000  28.000000 
##       NA's         SD       Skew       Kurt 
## 510.000000         NA         NA         NA
hist <- ggplot(ins_train, aes(CAR_AGE)) + geom_histogram(fill = 'dodgerblue', binwidth = 5, color = 'darkgray' ) + 
 theme_classic() + labs(title = 'Histogram of CAR_AGE') + theme(plot.title = element_text(hjust = 0.5)) 

qq_plot <- ggplot(ins_train, aes(sample=CAR_AGE)) + stat_qq_point(color='dodgerblue') + stat_qq_line(color='darkgray') +
  labs(x="Thoretical Quantiles", y="Sample Quantiles", title = "QQ Plot of CAR_AGE") + theme_classic() + 
  theme(plot.title = element_text(hjust = 0.5)) 

box_plot <- ggplot(ins_train, aes(x="", CAR_AGE)) + geom_boxplot(fill='dodgerblue', color='darkgray')+ theme_classic() +
  labs(title = 'Boxplot of CAR_AGE', x="") + theme(plot.title = element_text(hjust = 0.5)) + coord_flip()

box_target <- ggplot(ins_train, aes(x=factor(TARGET_FLAG), CAR_AGE)) + geom_boxplot(fill='dodgerblue', color='darkgrey') +
  labs(x='target', title = 'Boxplot of CAR_AGE by TARGET_FLAG') + theme_classic() +
  theme(plot.title = element_text(hjust = 0.5)) 

grid.arrange(hist, qq_plot, box_plot, box_target, ncol=2)
## Warning: Removed 510 rows containing non-finite values (stat_bin).
## Warning: Removed 510 rows containing non-finite values (stat_boxplot).

## Warning: Removed 510 rows containing non-finite values (stat_boxplot).

CLM_FREQ - # Claims (Past 5 Years). The more claims you filed in the past, the more you are likely to file in the future. We can see that this variable is also skewed.

with(ins_train, c(summary(CLM_FREQ), SD=sd(CLM_FREQ), Skew=skewness(CLM_FREQ), Kurt=kurtosis(CLM_FREQ)))
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max.        SD 
## 0.0000000 0.0000000 0.0000000 0.7985541 2.0000000 5.0000000 1.1584527 
##      Skew      Kurt 
## 1.2090207 3.2850940
hist <- ggplot(ins_train, aes(CLM_FREQ)) + geom_histogram(fill = 'dodgerblue', binwidth = 1, color = 'darkgray' ) + 
 theme_classic() + labs(title = 'Histogram of CLM_FREQ') + theme(plot.title = element_text(hjust = 0.5)) 

qq_plot <- ggplot(ins_train, aes(sample=CLM_FREQ)) + stat_qq_point(color='dodgerblue') + stat_qq_line(color='darkgray') +
  labs(x="Thoretical Quantiles", y="Sample Quantiles", title = "QQ Plot of CLM_FREQ") + theme_classic() + 
  theme(plot.title = element_text(hjust = 0.5)) 

box_plot <- ggplot(ins_train, aes(x="", CLM_FREQ)) + geom_boxplot(fill='dodgerblue', color='darkgray')+ theme_classic() +
  labs(title = 'Boxplot of CLM_FREQ', x="") + theme(plot.title = element_text(hjust = 0.5)) + coord_flip()

box_target <- ggplot(ins_train, aes(x=factor(TARGET_FLAG), CLM_FREQ)) + geom_boxplot(fill='dodgerblue', color='darkgrey') +
  labs(x='target', title = 'Boxplot of CLM_FREQ by TARGET_FLAG') + theme_classic() +
  theme(plot.title = element_text(hjust = 0.5)) 

grid.arrange(hist, qq_plot, box_plot, box_target, ncol=2)

HOMEKIDS - # Children at Home. HOMEKIDS does not seem to impact the TARGET_FLAG. The distribution of this discrete variable is right skewed.

with(ins_train, c(summary(HOMEKIDS), SD=sd(HOMEKIDS), Skew=skewness(HOMEKIDS), Kurt=kurtosis(HOMEKIDS)))
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max.        SD 
## 0.0000000 0.0000000 0.0000000 0.7212351 1.0000000 5.0000000 1.1163233 
##      Skew      Kurt 
## 1.3413736 3.6498859
hist <- ggplot(ins_train, aes(HOMEKIDS)) + geom_histogram(fill = 'dodgerblue', binwidth = 1, color = 'darkgray' ) + 
 theme_classic() + labs(title = 'Histogram of HOMEKIDS') + theme(plot.title = element_text(hjust = 0.5)) 

qq_plot <- ggplot(ins_train, aes(sample=HOMEKIDS)) + stat_qq_point(color='dodgerblue') + stat_qq_line(color='darkgray') +
  labs(x="Thoretical Quantiles", y="Sample Quantiles", title = "QQ Plot of HOMEKIDS") + theme_classic() + 
  theme(plot.title = element_text(hjust = 0.5)) 

box_plot <- ggplot(ins_train, aes(x="", HOMEKIDS)) + geom_boxplot(fill='dodgerblue', color='darkgray')+ theme_classic() +
  labs(title = 'Boxplot of HOMEKIDS', x="") + theme(plot.title = element_text(hjust = 0.5)) + coord_flip()

box_target <- ggplot(ins_train, aes(x=factor(TARGET_FLAG), HOMEKIDS)) + geom_boxplot(fill='dodgerblue', color='darkgrey') +
  labs(x='target', title = 'Boxplot of HOMEKIDS by TARGET_FLAG') + theme_classic() +
  theme(plot.title = element_text(hjust = 0.5)) 

grid.arrange(hist, qq_plot, box_plot, box_target, ncol=2)

HOME_VAL - Home Value. Home owners tend to drive more responsibly. The distribution of HOME_VAL is right skewed, and we can see there are some missing values.

with(ins_train, c(summary(HOME_VAL), SD=sd(HOME_VAL), Skew=skewness(HOME_VAL), Kurt=kurtosis(HOME_VAL)))
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's       SD 
##      0.0      0.0 161160.0 154867.3 238724.0 885282.0    464.0       NA 
##     Skew     Kurt 
##       NA       NA
hist <- ggplot(ins_train, aes(HOME_VAL)) + geom_histogram(fill = 'dodgerblue', binwidth = 100000, color = 'darkgray' ) + 
 theme_classic() + labs(title = 'Histogram of HOME_VAL') + theme(plot.title = element_text(hjust = 0.5)) 

qq_plot <- ggplot(ins_train, aes(sample=HOME_VAL)) + stat_qq_point(color='dodgerblue') + stat_qq_line(color='darkgray') +
  labs(x="Thoretical Quantiles", y="Sample Quantiles", title = "QQ Plot of HOME_VAL") + theme_classic() + 
  theme(plot.title = element_text(hjust = 0.5)) 

box_plot <- ggplot(ins_train, aes(x="", HOME_VAL)) + geom_boxplot(fill='dodgerblue', color='darkgray')+ theme_classic() +
  labs(title = 'Boxplot of HOME_VAL', x="") + theme(plot.title = element_text(hjust = 0.5)) + coord_flip()

box_target <- ggplot(ins_train, aes(x=factor(TARGET_FLAG), HOME_VAL)) + geom_boxplot(fill='dodgerblue', color='darkgrey') +
  labs(x='target', title = 'Boxplot of HOME_VAL by TARGET_FLAG') + theme_classic() +
  theme(plot.title = element_text(hjust = 0.5)) 

grid.arrange(hist, qq_plot, box_plot, box_target, ncol=2)
## Warning: Removed 464 rows containing non-finite values (stat_bin).
## Warning: Removed 464 rows containing non-finite values (stat_boxplot).

## Warning: Removed 464 rows containing non-finite values (stat_boxplot).

INCOME - Income of the person. Rich people tend to get into fewer crashes. The distribution of INCOME is right skewed, with a significant number of observations indicating $0 in income. There are some missing values in this variable as well.

with(ins_train, c(summary(INCOME), SD=sd(INCOME), Skew=skewness(INCOME), Kurt=kurtosis(INCOME)))
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max.      NA's 
##      0.00  28097.00  54028.00  61898.09  85986.00 367030.00    445.00 
##        SD      Skew      Kurt 
##        NA        NA        NA
hist <- ggplot(ins_train, aes(INCOME)) + geom_histogram(fill = 'dodgerblue', binwidth = 10000, color = 'darkgray' ) + 
 theme_classic() + labs(title = 'Histogram of INCOME') + theme(plot.title = element_text(hjust = 0.5)) 

qq_plot <- ggplot(ins_train, aes(sample=INCOME)) + stat_qq_point(color='dodgerblue') + stat_qq_line(color='darkgray') +
  labs(x="Thoretical Quantiles", y="Sample Quantiles", title = "QQ Plot of INCOME") + theme_classic() + 
  theme(plot.title = element_text(hjust = 0.5)) 

box_plot <- ggplot(ins_train, aes(x="", INCOME)) + geom_boxplot(fill='dodgerblue', color='darkgray')+ theme_classic() +
  labs(title = 'Boxplot of INCOME', x="") + theme(plot.title = element_text(hjust = 0.5)) + coord_flip()

box_target <- ggplot(ins_train, aes(x=factor(TARGET_FLAG), INCOME)) + geom_boxplot(fill='dodgerblue', color='darkgrey') +
  labs(x='target', title = 'Boxplot of INCOME by TARGET_FLAG') + theme_classic() +
  theme(plot.title = element_text(hjust = 0.5)) 

grid.arrange(hist, qq_plot, box_plot, box_target, ncol=2)
## Warning: Removed 445 rows containing non-finite values (stat_bin).
## Warning: Removed 445 rows containing non-finite values (stat_boxplot).

## Warning: Removed 445 rows containing non-finite values (stat_boxplot).

KIDSDRIV - # Driving Children. When teenagers drive your car, you are more likely to get into crashes. The discrete variable KIDSDRIV is right skewed.

with(ins_train, c(summary(KIDSDRIV), SD=sd(KIDSDRIV), Skew=skewness(KIDSDRIV), Kurt=kurtosis(KIDSDRIV)))
##       Min.    1st Qu.     Median       Mean    3rd Qu.       Max. 
##  0.0000000  0.0000000  0.0000000  0.1710575  0.0000000  4.0000000 
##         SD       Skew       Kurt 
##  0.5115341  3.3524536 14.7838144
hist <- ggplot(ins_train, aes(KIDSDRIV)) + geom_histogram(fill = 'dodgerblue', binwidth = 1, color = 'darkgray' ) + 
 theme_classic() + labs(title = 'Histogram of KIDSDRIV') + theme(plot.title = element_text(hjust = 0.5)) 

qq_plot <- ggplot(ins_train, aes(sample=KIDSDRIV)) + stat_qq_point(color='dodgerblue') + stat_qq_line(color='darkgray') +
  labs(x="Thoretical Quantiles", y="Sample Quantiles", title = "QQ Plot of KIDSDRIV") + theme_classic() + 
  theme(plot.title = element_text(hjust = 0.5)) 

box_plot <- ggplot(ins_train, aes(x="", KIDSDRIV)) + geom_boxplot(fill='dodgerblue', color='darkgray')+ theme_classic() +
  labs(title = 'Boxplot of KIDSDRIV', x="") + theme(plot.title = element_text(hjust = 0.5)) + coord_flip()

box_target <- ggplot(ins_train, aes(x=factor(TARGET_FLAG), KIDSDRIV)) + geom_boxplot(fill='dodgerblue', color='darkgrey') +
  labs(x='target', title = 'Boxplot of KIDSDRIV by TARGET_FLAG') + theme_classic() +
  theme(plot.title = element_text(hjust = 0.5)) 

grid.arrange(hist, qq_plot, box_plot, box_target, ncol=2)

MVR_PTS - Motor Vehicle Record Points. If you get lots of traffic tickets, you tend to get into more crashes. MVR_PTS is positively skewed.

with(ins_train, c(summary(MVR_PTS), SD=sd(MVR_PTS), Skew=skewness(MVR_PTS), Kurt=kurtosis(MVR_PTS)))
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max.        SD 
##  0.000000  0.000000  1.000000  1.695503  3.000000 13.000000  2.147112 
##      Skew      Kurt 
##  1.348088  4.376562
hist <- ggplot(ins_train, aes(MVR_PTS)) + geom_histogram(fill = 'dodgerblue', binwidth = 1, color = 'darkgray' ) + 
 theme_classic() + labs(title = 'Histogram of MVR_PTS') + theme(plot.title = element_text(hjust = 0.5)) 

qq_plot <- ggplot(ins_train, aes(sample=MVR_PTS)) + stat_qq_point(color='dodgerblue') + stat_qq_line(color='darkgray') +
  labs(x="Thoretical Quantiles", y="Sample Quantiles", title = "QQ Plot of MVR_PTS") + theme_classic() + 
  theme(plot.title = element_text(hjust = 0.5)) 

box_plot <- ggplot(ins_train, aes(x="", MVR_PTS)) + geom_boxplot(fill='dodgerblue', color='darkgray')+ theme_classic() +
  labs(title = 'Boxplot of MVR_PTS', x="") + theme(plot.title = element_text(hjust = 0.5)) + coord_flip()

box_target <- ggplot(ins_train, aes(x=factor(TARGET_FLAG), MVR_PTS)) + geom_boxplot(fill='dodgerblue', color='darkgrey') +
  labs(x='target', title = 'Boxplot of MVR_PTS by TARGET_FLAG') + theme_classic() +
  theme(plot.title = element_text(hjust = 0.5)) 

grid.arrange(hist, qq_plot, box_plot, box_target, ncol=2)

OLDCLAIM - Total Claims (Past 5 Years). If your total payout over the past five years was high, this suggests future payouts will be high. The distribution of OLDCLAIM is extremely right skewed.

with(ins_train, c(summary(OLDCLAIM), SD=sd(OLDCLAIM), Skew=skewness(OLDCLAIM), Kurt=kurtosis(OLDCLAIM)))
##         Min.      1st Qu.       Median         Mean      3rd Qu. 
##     0.000000     0.000000     0.000000  4037.076216  4636.000000 
##         Max.           SD         Skew         Kurt 
## 57037.000000  8777.139104     3.119613    12.863811
hist <- ggplot(ins_train, aes(OLDCLAIM)) + geom_histogram(fill = 'dodgerblue', binwidth = 10000, color = 'darkgray' ) + 
 theme_classic() + labs(title = 'Histogram of OLDCLAIM') + theme(plot.title = element_text(hjust = 0.5)) 

qq_plot <- ggplot(ins_train, aes(sample=OLDCLAIM)) + stat_qq_point(color='dodgerblue') + stat_qq_line(color='darkgray') +
  labs(x="Thoretical Quantiles", y="Sample Quantiles", title = "QQ Plot of OLDCLAIM") + theme_classic() + 
  theme(plot.title = element_text(hjust = 0.5)) 

box_plot <- ggplot(ins_train, aes(x="", OLDCLAIM)) + geom_boxplot(fill='dodgerblue', color='darkgray')+ theme_classic() +
  labs(title = 'Boxplot of OLDCLAIM', x="") + theme(plot.title = element_text(hjust = 0.5)) + coord_flip()

box_target <- ggplot(ins_train, aes(x=factor(TARGET_FLAG), OLDCLAIM)) + geom_boxplot(fill='dodgerblue', color='darkgrey') +
  labs(x='target', title = 'Boxplot of OLDCLAIM by TARGET_FLAG') + theme_classic() +
  theme(plot.title = element_text(hjust = 0.5)) 

grid.arrange(hist, qq_plot, box_plot, box_target, ncol=2)

TIF - Time in Force. People who have been customers for a long time are usually safer. The distribution is somewhat positively skewed.

with(ins_train, c(summary(TIF), SD=sd(TIF), Skew=skewness(TIF), Kurt=kurtosis(TIF)))
##       Min.    1st Qu.     Median       Mean    3rd Qu.       Max. 
##  1.0000000  1.0000000  4.0000000  5.3513050  7.0000000 25.0000000 
##         SD       Skew       Kurt 
##  4.1466353  0.8909758  3.4233329
hist <- ggplot(ins_train, aes(TIF)) + geom_histogram(fill = 'dodgerblue', binwidth = 1, color = 'darkgray' ) + 
 theme_classic() + labs(title = 'Histogram of TIF') + theme(plot.title = element_text(hjust = 0.5)) 

qq_plot <- ggplot(ins_train, aes(sample=TIF)) + stat_qq_point(color='dodgerblue') + stat_qq_line(color='darkgray') +
  labs(x="Thoretical Quantiles", y="Sample Quantiles", title = "QQ Plot of TIF") + theme_classic() + 
  theme(plot.title = element_text(hjust = 0.5)) 

box_plot <- ggplot(ins_train, aes(x="", TIF)) + geom_boxplot(fill='dodgerblue', color='darkgray')+ theme_classic() +
  labs(title = 'Boxplot of TIF', x="") + theme(plot.title = element_text(hjust = 0.5)) + coord_flip()

box_target <- ggplot(ins_train, aes(x=factor(TARGET_FLAG), TIF)) + geom_boxplot(fill='dodgerblue', color='darkgrey') +
  labs(x='target', title = 'Boxplot of TIF by TARGET_FLAG') + theme_classic() +
  theme(plot.title = element_text(hjust = 0.5)) 

grid.arrange(hist, qq_plot, box_plot, box_target, ncol=2)

TRAVTIME - Distance to Work. Long drives to work usually suggest greater risk. The distribution has a slight positive skew. The subset of insureds with no accidents has a higher proportion of individuals with short commute times.

with(ins_train, c(summary(TRAVTIME), SD=sd(TRAVTIME), Skew=skewness(TRAVTIME), Kurt=kurtosis(TRAVTIME)))
##        Min.     1st Qu.      Median        Mean     3rd Qu.        Max. 
##   5.0000000  22.0000000  33.0000000  33.4857248  44.0000000 142.0000000 
##          SD        Skew        Kurt 
##  15.9083334   0.4468995   3.6652313
hist <- ggplot(ins_train, aes(TRAVTIME)) + geom_histogram(fill = 'dodgerblue', binwidth = 10, color = 'darkgray' ) + 
 theme_classic() + labs(title = 'Histogram of TRAVTIME') + theme(plot.title = element_text(hjust = 0.5)) 

qq_plot <- ggplot(ins_train, aes(sample=TRAVTIME)) + stat_qq_point(color='dodgerblue') + stat_qq_line(color='darkgray') +
  labs(x="Thoretical Quantiles", y="Sample Quantiles", title = "QQ Plot of TRAVTIME") + theme_classic() + 
  theme(plot.title = element_text(hjust = 0.5)) 

box_plot <- ggplot(ins_train, aes(x="", TRAVTIME)) + geom_boxplot(fill='dodgerblue', color='darkgray')+ theme_classic() +
  labs(title = 'Boxplot of TRAVTIME', x="") + theme(plot.title = element_text(hjust = 0.5)) + coord_flip()

box_target <- ggplot(ins_train, aes(x=factor(TARGET_FLAG), TRAVTIME)) + geom_boxplot(fill='dodgerblue', color='darkgrey') +
  labs(x='target', title = 'Boxplot of TRAVTIME by TARGET_FLAG') + theme_classic() +
  theme(plot.title = element_text(hjust = 0.5)) 

grid.arrange(hist, qq_plot, box_plot, box_target, ncol=2)

YOJ - Years on Job. People who stay at a job for a long time are usually safer. The variable would be approximately normally distributed if it weren't for the high percentage of individuals with less than one year on the job.

with(ins_train, c(summary(YOJ), SD=sd(YOJ), Skew=skewness(YOJ), Kurt=kurtosis(YOJ)))
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max.      NA's 
##   0.00000   9.00000  11.00000  10.49929  13.00000  23.00000 454.00000 
##        SD      Skew      Kurt 
##        NA        NA        NA
hist <- ggplot(ins_train, aes(YOJ)) + geom_histogram(fill = 'dodgerblue', binwidth = 5, color = 'darkgray' ) + 
 theme_classic() + labs(title = 'Histogram of YOJ') + theme(plot.title = element_text(hjust = 0.5)) 

qq_plot <- ggplot(ins_train, aes(sample=YOJ)) + stat_qq_point(color='dodgerblue') + stat_qq_line(color='darkgray') +
  labs(x="Thoretical Quantiles", y="Sample Quantiles", title = "QQ Plot of YOJ") + theme_classic() + 
  theme(plot.title = element_text(hjust = 0.5)) 

box_plot <- ggplot(ins_train, aes(x="", YOJ)) + geom_boxplot(fill='dodgerblue', color='darkgray')+ theme_classic() +
  labs(title = 'Boxplot of YOJ', x="") + theme(plot.title = element_text(hjust = 0.5)) + coord_flip()

box_target <- ggplot(ins_train, aes(x=factor(TARGET_FLAG), YOJ)) + geom_boxplot(fill='dodgerblue', color='darkgrey') +
  labs(x='target', title = 'Boxplot of YOJ by TARGET_FLAG') + theme_classic() +
  theme(plot.title = element_text(hjust = 0.5)) 

grid.arrange(hist, qq_plot, box_plot, box_target, ncol=2)
## Warning: Removed 454 rows containing non-finite values (stat_bin).
## Warning: Removed 454 rows containing non-finite values (stat_boxplot).

## Warning: Removed 454 rows containing non-finite values (stat_boxplot).

EDUCATION - Unknown effect, but in theory more educated people tend to drive more safely.

options(width=100)
tbl <- with(ins_train, rbind(addmargins(table(EDUCATION)),addmargins(prop.table(table(EDUCATION)))*100))
## Warning in if (attr(list(...)[[1]], "class") == "mids") return(rbind.mids(...)) else
## return(base::rbind(...)): the condition has length > 1 and only the first element will be used
row.names(tbl) <- c('count','percent')
round(tbl,1)
##         <High School Bachelors Masters   PhD z_High School  Sum
## count         1203.0    2242.0  1658.0 728.0        2330.0 8161
## percent         14.7      27.5    20.3   8.9          28.6  100

REVOKED - License Revoked (Past 7 Years). If your license was revoked in the past 7 years, you are probably a riskier driver. Only 12% of drivers in the training data have a former license suspension on record.

tbl <- addmargins(table(REVOKED=ins_train$REVOKED,TARGET_FLAG=ins_train$TARGET_FLAG))
tbl
##        TARGET_FLAG
## REVOKED    0    1  Sum
##     No  5451 1710 7161
##     Yes  557  443 1000
##     Sum 6008 2153 8161
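
As is done for SEX and MSTATUS below, a two-sample proportions test (a minimal sketch) can check whether the crash rate differs between drivers with and without a revoked license:

round(prop.table(tbl[1:2,1:2], margin=1),2)
prop.test(tbl[1:2,1:2])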

RED_CAR - A Red Car. Urban legend says that red cars (especially red sports cars) are riskier. Is that true? About 29% of the vehicles in the data are red.

tbl <- addmargins(table(RED_CAR=ins_train$RED_CAR,TARGET_FLAG=ins_train$TARGET_FLAG))
tbl
##        TARGET_FLAG
## RED_CAR    0    1  Sum
##     no  4246 1537 5783
##     yes 1762  616 2378
##     Sum 6008 2153 8161

CAR_USE - Vehicle Use. Commercial vehicles are driven more, so commercial use might increase the probability of collision. About 63% of vehicles are for private use.

tbl <- addmargins(table(CAR_USE=ins_train$CAR_USE,TARGET_FLAG=ins_train$TARGET_FLAG))
tbl
##             TARGET_FLAG
## CAR_USE         0    1  Sum
##   Commercial 1982 1047 3029
##   Private    4026 1106 5132
##   Sum        6008 2153 8161

SEX - Gender. Urban legend says that women have fewer crashes than men. Is that true? The split between males and females is close to 50/50.

tbl <- addmargins(table(SEX=ins_train$SEX,TARGET_FLAG=ins_train$TARGET_FLAG))
tbl
##      TARGET_FLAG
## SEX      0    1  Sum
##   M   2825  961 3786
##   z_F 3183 1192 4375
##   Sum 6008 2153 8161
round(prop.table(tbl[1:2,1:2], margin=1),2)
##      TARGET_FLAG
## SEX      0    1
##   M   0.75 0.25
##   z_F 0.73 0.27
prop.test(tbl[1:2,1:2])
## 
##  2-sample test for equality of proportions with continuity correction
## 
## data:  tbl[1:2, 1:2]
## X-squared = 3.5307, df = 1, p-value = 0.06024
## alternative hypothesis: two.sided
## 95 percent confidence interval:
##  -0.0007561151  0.0380106016
## sample estimates:
##    prop 1    prop 2 
## 0.7461701 0.7275429

MSTATUS - Marital Status. In theory, married people drive more safely. There is a fairly balanced split (60/40) between married and single insureds.

tbl <- addmargins(table(MSTATUS=ins_train$MSTATUS,TARGET_FLAG=ins_train$TARGET_FLAG))
tbl
##        TARGET_FLAG
## MSTATUS    0    1  Sum
##    Yes  3841 1053 4894
##    z_No 2167 1100 3267
##    Sum  6008 2153 8161
round(prop.table(tbl[1:2,1:2], margin=1),2)
##        TARGET_FLAG
## MSTATUS    0    1
##    Yes  0.78 0.22
##    z_No 0.66 0.34
prop.test(tbl[1:2,1:2])
## 
##  2-sample test for equality of proportions with continuity correction
## 
## data:  tbl[1:2, 1:2]
## X-squared = 148.38, df = 1, p-value < 2.2e-16
## alternative hypothesis: two.sided
## 95 percent confidence interval:
##  0.1014053 0.1416726
## sample estimates:
##    prop 1    prop 2 
## 0.7848386 0.6632997

PARENT1 - Single Parent. There is a 20-percentage-point difference in crash proportions between single parents (44%) and other insureds (24%). This difference is statistically significant:

tbl <- addmargins(table(PARENT1=ins_train$PARENT1,TARGET_FLAG=ins_train$TARGET_FLAG))
tbl
##        TARGET_FLAG
## PARENT1    0    1  Sum
##     No  5407 1677 7084
##     Yes  601  476 1077
##     Sum 6008 2153 8161
round(prop.table(tbl[1:2,1:2], margin=1),2)
##        TARGET_FLAG
## PARENT1    0    1
##     No  0.76 0.24
##     Yes 0.56 0.44
prop.test(tbl[1:2,1:2])
## 
##  2-sample test for equality of proportions with continuity correction
## 
## data:  tbl[1:2, 1:2]
## X-squared = 201.7, df = 1, p-value < 2.2e-16
## alternative hypothesis: two.sided
## 95 percent confidence interval:
##  0.1734351 0.2370404
## sample estimates:
##    prop 1    prop 2 
## 0.7632693 0.5580316

CAR_TYPE - Type of Car. Sports cars have the highest proportion of accidents, and minivans the lowest.

tbl <- with(ins_train, addmargins(table(CAR_TYPE, TARGET_FLAG)))
tbl
##              TARGET_FLAG
## CAR_TYPE         0    1  Sum
##   Minivan     1796  349 2145
##   Panel Truck  498  178  676
##   Pickup       946  443 1389
##   Sports Car   603  304  907
##   Van          549  201  750
##   z_SUV       1616  678 2294
##   Sum         6008 2153 8161
pt <- round(prop.table(tbl[1:6,1:2], margin=1),2)
pt
##              TARGET_FLAG
## CAR_TYPE         0    1
##   Minivan     0.84 0.16
##   Panel Truck 0.74 0.26
##   Pickup      0.68 0.32
##   Sports Car  0.66 0.34
##   Van         0.73 0.27
##   z_SUV       0.70 0.30
prop.test(tbl[1:6,1:2])
## 
##  6-sample test for equality of proportions without continuity correction
## 
## data:  tbl[1:6, 1:2]
## X-squared = 170.38, df = 5, p-value < 2.2e-16
## alternative hypothesis: two.sided
## sample estimates:
##    prop 1    prop 2    prop 3    prop 4    prop 5    prop 6 
## 0.8372960 0.7366864 0.6810655 0.6648291 0.7320000 0.7044464

TARGET Variables

TARGET_FLAG - The response variable TARGET_FLAG has a moderate imbalance, with three-quarters of the observations indicating no crashes.

tbl <- with(ins_train,rbind(round(addmargins(table(TARGET_FLAG)),0),
                       addmargins(prop.table(table(TARGET_FLAG)))*100))
## Warning in if (attr(list(...)[[1]], "class") == "mids") return(rbind.mids(...)) else
## return(base::rbind(...)): the condition has length > 1 and only the first element will be used
row.names(tbl) <- c('count','percent')
round(tbl,1)
##              0      1  Sum
## count   6008.0 2153.0 8161
## percent   73.6   26.4  100
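
Given this imbalance, a useful reference point (a minimal sketch) is the accuracy of always predicting "no crash"; any classifier we build should beat it:

mean(ins_train$TARGET_FLAG == 0)  # ~0.736, the majority-class accuracy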

TARGET_AMT - exhibits extreme positive skewness and high kurtosis.

options(width=100)
round(with(ins_train, c(summary(TARGET_AMT), StdD=sd(TARGET_AMT), Skew=skewness(TARGET_AMT), Kurt=kurtosis(TARGET_AMT))),2)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max.      StdD      Skew      Kurt 
##      0.00      0.00      0.00   1504.32   1036.00 107586.14   4704.03      8.71    115.32
h <- ggplot(ins_train, aes(TARGET_AMT)) + 
  geom_histogram(color="ghostwhite", fill="darkgrey") +
  theme_classic()+ labs(title = 'Histogram of TARGET_AMT') + 
  theme(plot.title = element_text(hjust = 0.5),axis.title.y=element_text(size=10)) + 
  theme(legend.position = c(1,1),legend.justification  = c(1,1), legend.background = element_rect(fill='dodgerblue')) +
  scale_fill_manual("TARGET_FLAG",values=c("dodgerblue","dodgerblue")) +
  theme(plot.title = element_text(size=12),legend.title=element_text(size=8),
        legend.text=element_text(size=7),panel.background = element_rect(fill = "dodgerblue")) 

b <- ggplot(ins_train, aes(x="",y=TARGET_AMT)) + 
  geom_boxplot(color="ghostwhite", fill="steelblue4",outlier.color="darkgrey", outlier.size = 0.5) +
  theme_classic()+ labs(title = 'Boxplot of TARGET_AMT') + 
  theme(plot.title = element_text(hjust = 0.5),axis.title.y=element_text(size=10)) + 
  theme(legend.position = c(1,1),legend.justification  = c(1,1), legend.background = element_rect(fill='dodgerblue')) +
  scale_fill_manual("TARGET_FLAG",values=c("dodgerblue","dodgerblue")) +
  theme(plot.title = element_text(size=12),legend.title=element_text(size=8),
        legend.text=element_text(size=7),panel.background = element_rect(fill = "dodgerblue")) + coord_flip() + 
  stat_summary(fun.y=mean, colour="darkred", geom="point", shape=16, size=2)


grid.arrange(h,b, ncol=2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

DATA PREPARATION:

There are 7 variables that have only 2 values, so we can make them binary.

PARENT1 - Convert yes to 1

MSTATUS - Convert yes to 1

RED_CAR - Convert yes to 1

REVOKED - Convert yes to 1

SEX - Convert male to 1

CAR_USE - Convert Commercial to 1

URBANICITY - Convert Highly Urban/ Urban to 1

#Convert indicator variables to 0s and 1s; 1 = Yes, Male for Sex, Commercial for Car Use, Red for RED_CAR, and Highly Urban for URBANICITY
ins_train$PARENT1 <- ifelse(ins_train$PARENT1=="Yes", 1, 0)
ins_train$MSTATUS <- ifelse(ins_train$MSTATUS=="Yes", 1, 0)
ins_train$SEX <- ifelse(ins_train$SEX=="M", 1, 0)
ins_train$CAR_USE <- ifelse(ins_train$CAR_USE=="Commercial", 1, 0)
ins_train$RED_CAR <- ifelse(ins_train$RED_CAR=="yes", 1, 0) # levels are lowercase "yes"/"no"
ins_train$REVOKED <- ifelse(ins_train$REVOKED=="Yes", 1, 0)
ins_train$URBANICITY <- ifelse(ins_train$URBANICITY == "Highly Urban/ Urban", 1, 0)
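
A quick check (a minimal sketch) that each converted flag contains both 0s and 1s; note that comparing RED_CAR against "Yes" instead of the lowercase level "yes" would have produced all zeroes:

sapply(ins_train[c("PARENT1", "MSTATUS", "SEX", "CAR_USE", "RED_CAR", "REVOKED", "URBANICITY")], table)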

#Convert categorical predictor values to indicator variables - EDUCATION, CAR_TYPE, JOB

#EDUCATION, High school graduate is base case
ins_train$HSDropout <- ifelse(ins_train$EDUCATION=="<High School", 1, 0)
ins_train$Bachelors <- ifelse(ins_train$EDUCATION=="Bachelors", 1, 0)
ins_train$Masters <- ifelse(ins_train$EDUCATION=="Masters", 1, 0)
ins_train$PhD <- ifelse(ins_train$EDUCATION=="PhD", 1, 0)

#CAR_TYPE, base case is minivan
ins_train$Panel_Truck <- ifelse(ins_train$CAR_TYPE=="Panel Truck", 1, 0)
ins_train$Pickup <- ifelse(ins_train$CAR_TYPE=="Pickup", 1, 0)
ins_train$Sports_Car <- ifelse(ins_train$CAR_TYPE=="Sports Car", 1, 0)
ins_train$Van <- ifelse(ins_train$CAR_TYPE=="Van", 1, 0)
ins_train$SUV <- ifelse(ins_train$CAR_TYPE=="z_SUV", 1, 0)

#JOB, base case is ""
ins_train$Professional <- ifelse(ins_train$JOB == "Professional", 1, 0)
ins_train$Blue_Collar <- ifelse(ins_train$JOB == "z_Blue Collar", 1, 0)
ins_train$Clerical <- ifelse(ins_train$JOB == "Clerical", 1, 0)
ins_train$Doctor <- ifelse(ins_train$JOB == "Doctor", 1, 0)
ins_train$Lawyer <- ifelse(ins_train$JOB == "Lawyer", 1, 0)
ins_train$Manager <- ifelse(ins_train$JOB == "Manager", 1, 0)
ins_train$Home_Maker <- ifelse(ins_train$JOB == "Home Maker", 1, 0)
ins_train$Student <- ifelse(ins_train$JOB == "Student", 1, 0)

Missing / Error Value Treatment:

Because several of the variables with missing data are skewed, we impute with the median; the mean would be biased by the skew of these variables' distributions.

ins_train$CAR_AGE[ins_train$CAR_AGE == -3] <- NA # the negative CAR_AGE is a data error; treat it as missing

ins_train <- ins_train %>% dplyr::select(-c(INDEX,EDUCATION,CAR_TYPE,JOB))

fillwithmedian <- function(x) {
  median_val = median(x, na.rm = TRUE)
  x[is.na(x)] = median_val
  return(x)
}

ins_train <- data.frame(lapply(ins_train, fillwithmedian))
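
After imputation, a quick check (a minimal sketch) confirms that no missing values remain:

colSums(is.na(ins_train))  # should be all zeroes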

Let's look at the variables and see which transformations to use.

INCOME

Income is a positively skewed variable with a significant number of zeroes. We apply the power transformation suggested by the Box-Cox procedure (lambda of about 0.43, close to a square root) to the original variable to reduce the overall skew.

boxcoxfit(ins_train$INCOME[ins_train$INCOME >0])
## Fitted parameters:
##       lambda         beta      sigmasq 
##    0.4335048  265.9105875 6864.8508749 
## 
## Convergence code returned by optim: 0
ins_train$INCOME_MOD <- ins_train$INCOME ^0.433

HOME_VAL

Home values are also moderately right skewed, with a significant number of zeroes. We apply the power transformation suggested by the Box-Cox procedure (lambda of about 0.11) to the original variable to reduce the overall skew.

boxcoxfit(ins_train$HOME_VAL[ins_train$HOME_VAL > 0])
## Fitted parameters:
##     lambda       beta    sigmasq 
##  0.1134775 26.3870095  2.8759681 
## 
## Convergence code returned by optim: 0
ins_train$HOME_VAL_MOD <- ins_train$HOME_VAL^0.113

BLUEBOOK

The BLUEBOOK variable has a moderate right skew. We apply the power transformation suggested by the Box-Cox procedure (lambda of about 0.46, close to a square root).

boxcoxfit(ins_train$BLUEBOOK)
## Fitted parameters:
##       lambda         beta      sigmasq 
##    0.4610754  177.4257712 2217.4825612 
## 
## Convergence code returned by optim: 0
ins_train$BLUEBOOK_MOD <- ins_train$BLUEBOOK^0.461

OLDCLAIM

OLDCLAIM is extremely right skewed. The fitted lambda is approximately 0, so we apply a log(x+1) transformation to reduce the overall skew.

boxcoxfit(ins_train$OLDCLAIM[ins_train$OLDCLAIM>0])
## Fitted parameters:
##      lambda        beta     sigmasq 
## -0.04511237  7.22517933  0.44041250 
## 
## Convergence code returned by optim: 0
ins_train$OLD_CLAIM_MOD <- log(ins_train$OLDCLAIM + 1)   
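
As a quick check on all four transformations (a minimal sketch), compare the skewness of the original and transformed variables:

data.frame(
  original    = with(ins_train, c(skewness(INCOME), skewness(HOME_VAL), skewness(BLUEBOOK), skewness(OLDCLAIM))),
  transformed = with(ins_train, c(skewness(INCOME_MOD), skewness(HOME_VAL_MOD), skewness(BLUEBOOK_MOD), skewness(OLD_CLAIM_MOD))),
  row.names   = c("INCOME", "HOME_VAL", "BLUEBOOK", "OLDCLAIM")
)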

BUILD MODELS:

  1. Multiple linear regression models:

Model 1: In this model we will use all the variables; this serves as our base model. We can see which variables are significant, which will help us remove the non-significant ones based on their p-values.

train_amount <- ins_train[,-c(1)] #Training dataset with response of claim amount

amount_full_model1 <- lm(TARGET_AMT ~., data = train_amount)
summary(amount_full_model1)
## 
## Call:
## lm(formula = TARGET_AMT ~ ., data = train_amount)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -5894  -1698   -764    359 103840 
## 
## Coefficients: (2 not defined because of singularities)
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -8.847e+02  7.305e+02  -1.211 0.225919    
## KIDSDRIV       3.159e+02  1.132e+02   2.790 0.005282 ** 
## AGE            4.905e+00  7.091e+00   0.692 0.489098    
## HOMEKIDS       7.569e+01  6.576e+01   1.151 0.249771    
## YOJ           -2.644e+00  1.707e+01  -0.155 0.876919    
## INCOME        -4.386e-03  4.002e-03  -1.096 0.273171    
## PARENT1        5.697e+02  2.024e+02   2.815 0.004885 ** 
## HOME_VAL      -1.486e-04  1.094e-03  -0.136 0.891922    
## MSTATUS       -5.541e+02  1.498e+02  -3.698 0.000218 ***
## SEX            3.346e+02  1.625e+02   2.059 0.039497 *  
## TRAVTIME       1.196e+01  3.223e+00   3.713 0.000207 ***
## CAR_USE        8.086e+02  1.630e+02   4.961 7.14e-07 ***
## BLUEBOOK       3.550e-03  3.328e-02   0.107 0.915038    
## TIF           -4.800e+01  1.218e+01  -3.940 8.23e-05 ***
## RED_CAR               NA         NA      NA       NA    
## OLDCLAIM      -1.847e-02  8.994e-03  -2.053 0.040078 *  
## CLM_FREQ       3.730e+01  8.734e+01   0.427 0.669341    
## REVOKED        5.906e+02  1.755e+02   3.366 0.000766 ***
## MVR_PTS        1.648e+02  2.672e+01   6.170 7.18e-10 ***
## CAR_AGE       -2.682e+01  1.280e+01  -2.096 0.036147 *  
## URBANICITY     1.618e+03  1.405e+02  11.514  < 2e-16 ***
## HSDropout      1.071e+02  1.725e+02   0.621 0.534564    
## Bachelors     -1.978e+02  1.570e+02  -1.260 0.207677    
## Masters       -8.223e+01  2.302e+02  -0.357 0.720910    
## PhD            1.485e+02  2.933e+02   0.506 0.612678    
## Panel_Truck    1.720e+02  2.766e+02   0.622 0.533880    
## Pickup         3.426e+02  1.696e+02   2.020 0.043410 *  
## Sports_Car     1.016e+03  2.181e+02   4.659 3.22e-06 ***
## Van            4.622e+02  2.115e+02   2.186 0.028875 *  
## SUV            7.388e+02  1.795e+02   4.116 3.89e-05 ***
## Professional   7.615e+01  1.961e+02   0.388 0.697813    
## Blue_Collar           NA         NA      NA       NA    
## Clerical       7.619e+01  1.899e+02   0.401 0.688277    
## Doctor        -6.970e+02  3.888e+02  -1.793 0.073074 .  
## Lawyer        -9.410e+00  2.569e+02  -0.037 0.970778    
## Manager       -8.037e+02  2.031e+02  -3.957 7.64e-05 ***
## Home_Maker    -6.325e+01  2.908e+02  -0.218 0.827819    
## Student       -2.296e+02  2.839e+02  -0.809 0.418678    
## INCOME_MOD    -7.343e-01  4.403e+00  -0.167 0.867547    
## HOME_VAL_MOD  -2.905e+01  7.127e+01  -0.408 0.683578    
## BLUEBOOK_MOD   4.230e+00  1.238e+01   0.342 0.732642    
## OLD_CLAIM_MOD  4.485e+01  2.920e+01   1.536 0.124545    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4545 on 8121 degrees of freedom
## Multiple R-squared:  0.07105,    Adjusted R-squared:  0.06659 
## F-statistic: 15.93 on 39 and 8121 DF,  p-value: < 2.2e-16

Model 2 (reduced model): I came up with this model after analyzing the output of Model 1, removing all the variables whose p-values show they are not significant.

amount_reduced_model2 <- update(amount_full_model1, .~.-HSDropout-Home_Maker-Bachelors-Masters-PhD-Panel_Truck-Blue_Collar-Professional-Student-HOMEKIDS-CAR_AGE-YOJ-Lawyer-SEX-AGE-Doctor-Clerical-INCOME-HOME_VAL-BLUEBOOK-RED_CAR-INCOME_MOD-HOME_VAL_MOD-BLUEBOOK_MOD-OLD_CLAIM_MOD-OLDCLAIM)
summary(amount_reduced_model2)
## 
## Call:
## lm(formula = TARGET_AMT ~ KIDSDRIV + PARENT1 + MSTATUS + TRAVTIME + 
##     CAR_USE + TIF + CLM_FREQ + REVOKED + MVR_PTS + URBANICITY + 
##     Pickup + Sports_Car + Van + SUV + Manager, data = train_amount)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -5685  -1698   -800    304 103866 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -669.790    217.622  -3.078 0.002093 ** 
## KIDSDRIV     393.362    102.187   3.849 0.000119 ***
## PARENT1      730.172    175.662   4.157 3.26e-05 ***
## MSTATUS     -534.231    118.994  -4.490 7.23e-06 ***
## TRAVTIME      12.187      3.225   3.779 0.000158 ***
## CAR_USE      899.712    112.672   7.985 1.59e-15 ***
## TIF          -46.869     12.184  -3.847 0.000121 ***
## CLM_FREQ     127.463     48.762   2.614 0.008965 ** 
## REVOKED      473.061    155.100   3.050 0.002296 ** 
## MVR_PTS      178.285     25.813   6.907 5.33e-12 ***
## URBANICITY  1467.231    134.707  10.892  < 2e-16 ***
## Pickup       317.736    151.471   2.098 0.035965 *  
## Sports_Car   790.710    176.754   4.473 7.80e-06 ***
## Van          437.232    188.775   2.316 0.020574 *  
## SUV          514.705    131.086   3.926 8.69e-05 ***
## Manager     -961.429    159.097  -6.043 1.58e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4556 on 8145 degrees of freedom
## Multiple R-squared:  0.06366,    Adjusted R-squared:  0.06194 
## F-statistic: 36.92 on 15 and 8145 DF,  p-value: < 2.2e-16
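
Since Model 2 is nested within Model 1, a partial F-test (a minimal sketch) checks whether dropping those variables significantly worsens the fit:

anova(amount_reduced_model2, amount_full_model1)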

Interpretation of Model 1:

The Residual standard error is 4545

Multiple R-squared: 0.07105

Adjusted R-squared: 0.06659

F-statistic: 15.93 on 39 and 8121 DF

p-value: < 2.2e-16

Plot of the studentized residuals to check whether they are approximately normally distributed:

sresid <- studres(amount_full_model1)  # studentized residuals
hist(sresid, freq = FALSE,
     main = "Distribution of Residuals")
xfit <- seq(min(sresid), max(sresid), length = 40)
yfit <- dnorm(xfit)
lines(xfit, yfit)  # overlay the standard normal density

Check for Homoscedasticity:

ncvTest(amount_full_model1)
## Non-constant Variance Score Test 
## Variance formula: ~ fitted.values 
## Chisquare = 3209.905    Df = 1     p = 0
spreadLevelPlot(amount_full_model1)
## Warning in spreadLevelPlot.lm(amount_full_model1): 
## 1011 negative fitted values removed

## 
## Suggested power transformation:  0.000210097

Interpretation of Model 2:

The Residual standard error is 4556

Multiple R-squared: 0.06366

Adjusted R-squared: 0.06194

F-statistic: 36.92 on 15 and 8145 DF

p-value: < 2.2e-16

Plot of the studentized residuals to check whether they are approximately normally distributed:

sresid <- studres(amount_reduced_model2)  # studentized residuals
hist(sresid, freq = FALSE,
     main = "Distribution of Residuals")
xfit <- seq(min(sresid), max(sresid), length = 40)
yfit <- dnorm(xfit)
lines(xfit, yfit)  # overlay the standard normal density

Check for Homoscedasticity:

ncvTest(amount_reduced_model2)
## Non-constant Variance Score Test 
## Variance formula: ~ fitted.values 
## Chisquare = 2762.247    Df = 1     p = 0
spreadLevelPlot(amount_reduced_model2)
## Warning in spreadLevelPlot.lm(amount_reduced_model2): 
## 858 negative fitted values removed

## 
## Suggested power transformation:  7.842944e-05
  2. Binary logistic regression models:

Model 3 (base model): all variables without transformation. Fitting all of the variables establishes a baseline, lets us see which variables are significant in our dataset, and guides the models that follow.

train_flag <- ins_train[,-c(2)] #Training dataset with response of crash
flagfull <- glm(TARGET_FLAG ~.-INCOME_MOD-HOME_VAL_MOD-BLUEBOOK_MOD-OLD_CLAIM_MOD, data = train_flag, family = binomial(link='logit'))
summary(flagfull)
## 
## Call:
## glm(formula = TARGET_FLAG ~ . - INCOME_MOD - HOME_VAL_MOD - BLUEBOOK_MOD - 
##     OLD_CLAIM_MOD, family = binomial(link = "logit"), data = train_flag)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.5940  -0.7152  -0.3989   0.6316   3.1401  
## 
## Coefficients: (2 not defined because of singularities)
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -3.360e+00  2.819e-01 -11.920  < 2e-16 ***
## KIDSDRIV      3.890e-01  6.119e-02   6.357 2.06e-10 ***
## AGE          -1.005e-03  4.018e-03  -0.250 0.802461    
## HOMEKIDS      4.965e-02  3.712e-02   1.337 0.181061    
## YOJ          -1.100e-02  8.585e-03  -1.281 0.200271    
## INCOME       -3.587e-06  1.078e-06  -3.326 0.000880 ***
## PARENT1       3.813e-01  1.095e-01   3.481 0.000499 ***
## HOME_VAL     -1.304e-06  3.418e-07  -3.815 0.000136 ***
## MSTATUS      -4.937e-01  8.354e-02  -5.910 3.41e-09 ***
## SEX           7.723e-02  1.002e-01   0.771 0.440722    
## TRAVTIME      1.464e-02  1.882e-03   7.778 7.36e-15 ***
## CAR_USE       7.769e-01  9.101e-02   8.537  < 2e-16 ***
## BLUEBOOK     -2.074e-05  5.258e-06  -3.945 7.99e-05 ***
## TIF          -5.556e-02  7.344e-03  -7.566 3.85e-14 ***
## RED_CAR              NA         NA      NA       NA    
## OLDCLAIM     -1.397e-05  3.910e-06  -3.574 0.000352 ***
## CLM_FREQ      1.958e-01  2.853e-02   6.864 6.68e-12 ***
## REVOKED       8.876e-01  9.132e-02   9.719  < 2e-16 ***
## MVR_PTS       1.130e-01  1.360e-02   8.308  < 2e-16 ***
## CAR_AGE      -5.727e-04  7.548e-03  -0.076 0.939521    
## URBANICITY    2.386e+00  1.129e-01  21.137  < 2e-16 ***
## HSDropout    -6.142e-03  9.479e-02  -0.065 0.948336    
## Bachelors    -4.135e-01  8.908e-02  -4.642 3.46e-06 ***
## Masters      -4.523e-01  1.340e-01  -3.375 0.000738 ***
## PhD          -3.481e-01  1.715e-01  -2.030 0.042385 *  
## Panel_Truck   5.037e-01  1.582e-01   3.184 0.001454 ** 
## Pickup        5.326e-01  9.999e-02   5.327 1.00e-07 ***
## Sports_Car    1.024e+00  1.299e-01   7.888 3.08e-15 ***
## Van           5.906e-01  1.254e-01   4.709 2.49e-06 ***
## SUV           7.670e-01  1.112e-01   6.897 5.30e-12 ***
## Professional -7.245e-02  1.109e-01  -0.654 0.513400    
## Blue_Collar          NA         NA      NA       NA    
## Clerical      1.324e-01  1.051e-01   1.260 0.207523    
## Doctor       -5.477e-01  2.599e-01  -2.108 0.035069 *  
## Lawyer       -1.933e-02  1.516e-01  -0.127 0.898557    
## Manager      -7.582e-01  1.229e-01  -6.168 6.91e-10 ***
## Home_Maker   -1.721e-02  1.483e-01  -0.116 0.907601    
## Student      -7.218e-02  1.282e-01  -0.563 0.573549    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 9418.0  on 8160  degrees of freedom
## Residual deviance: 7300.5  on 8125  degrees of freedom
## AIC: 7372.5
## 
## Number of Fisher Scoring iterations: 5
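
As a quick in-sample check of Model 3's discrimination, here is a minimal sketch using the pROC package loaded earlier:

roc_flagfull <- roc(train_flag$TARGET_FLAG, fitted(flagfull))
auc(roc_flagfull)  # area under the ROC curve on the training data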

Model 4: We now add the transformed variables to the model.

train_flag <- ins_train[,-c(2)] #Training dataset with response of crash
flagfull_mod <- glm(TARGET_FLAG ~., data = train_flag, family = binomial(link='logit'))
summary(flagfull_mod)
## 
## Call:
## glm(formula = TARGET_FLAG ~ ., family = binomial(link = "logit"), 
##     data = train_flag)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.5796  -0.7115  -0.3909   0.6205   3.1550  
## 
## Coefficients: (2 not defined because of singularities)
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   -1.911e+00  4.179e-01  -4.574 4.79e-06 ***
## KIDSDRIV       3.985e-01  6.149e-02   6.481 9.14e-11 ***
## AGE           -2.086e-03  4.043e-03  -0.516 0.605950    
## HOMEKIDS       3.398e-02  3.753e-02   0.905 0.365297    
## YOJ            5.434e-03  9.881e-03   0.550 0.582369    
## INCOME         1.837e-06  2.374e-06   0.774 0.439039    
## PARENT1        3.714e-01  1.101e-01   3.373 0.000745 ***
## HOME_VAL      -3.082e-07  6.582e-07  -0.468 0.639657    
## MSTATUS       -4.781e-01  8.679e-02  -5.509 3.62e-08 ***
## SEX            1.092e-01  1.007e-01   1.084 0.278163    
## TRAVTIME       1.480e-02  1.891e-03   7.825 5.06e-15 ***
## CAR_USE        7.736e-01  9.148e-02   8.456  < 2e-16 ***
## BLUEBOOK       4.075e-05  1.969e-05   2.069 0.038513 *  
## TIF           -5.478e-02  7.364e-03  -7.439 1.01e-13 ***
## RED_CAR               NA         NA      NA       NA    
## OLDCLAIM      -2.615e-05  4.774e-06  -5.476 4.35e-08 ***
## CLM_FREQ       4.548e-02  4.412e-02   1.031 0.302637    
## REVOKED        9.527e-01  9.305e-02  10.238  < 2e-16 ***
## MVR_PTS        9.540e-02  1.410e-02   6.768 1.31e-11 ***
## CAR_AGE        2.274e-05  7.562e-03   0.003 0.997600    
## URBANICITY     2.361e+00  1.141e-01  20.700  < 2e-16 ***
## HSDropout     -4.795e-02  9.597e-02  -0.500 0.617378    
## Bachelors     -4.035e-01  8.963e-02  -4.501 6.75e-06 ***
## Masters       -4.650e-01  1.347e-01  -3.451 0.000558 ***
## PhD           -4.661e-01  1.747e-01  -2.668 0.007625 ** 
## Panel_Truck    3.581e-01  1.626e-01   2.202 0.027696 *  
## Pickup         5.378e-01  1.004e-01   5.358 8.40e-08 ***
## Sports_Car     1.024e+00  1.304e-01   7.855 4.01e-15 ***
## Van            5.956e-01  1.255e-01   4.745 2.08e-06 ***
## SUV            7.944e-01  1.122e-01   7.083 1.41e-12 ***
## Professional  -8.353e-02  1.111e-01  -0.752 0.452119    
## Blue_Collar           NA         NA      NA       NA    
## Clerical       8.731e-02  1.061e-01   0.823 0.410447    
## Doctor        -5.135e-01  2.588e-01  -1.984 0.047251 *  
## Lawyer        -2.688e-02  1.522e-01  -0.177 0.859771    
## Manager       -7.676e-01  1.230e-01  -6.238 4.42e-10 ***
## Home_Maker    -2.989e-01  1.704e-01  -1.754 0.079346 .  
## Student       -4.639e-01  1.590e-01  -2.917 0.003531 ** 
## INCOME_MOD    -8.396e-03  2.592e-03  -3.240 0.001197 ** 
## HOME_VAL_MOD  -6.994e-02  4.124e-02  -1.696 0.089928 .  
## BLUEBOOK_MOD  -2.301e-02  7.155e-03  -3.216 0.001300 ** 
## OLD_CLAIM_MOD  6.907e-02  1.510e-02   4.574 4.78e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 9418.0  on 8160  degrees of freedom
## Residual deviance: 7254.7  on 8121  degrees of freedom
## AIC: 7334.7
## 
## Number of Fisher Scoring iterations: 5

Model 5: We keep only the significant variables from Model 4 for this reduced model.

train_flag <- ins_train[,-c(2)] #Training dataset with response of crash
flag_reduced_mod <- glm(TARGET_FLAG ~.-AGE-HOMEKIDS-YOJ-INCOME-HOME_VAL-SEX-RED_CAR-CLM_FREQ-CAR_AGE-HSDropout-Professional-Blue_Collar-Clerical-Lawyer-Home_Maker-HOME_VAL_MOD-Student-Doctor, data = train_flag, family = binomial(link='logit'))
summary(flag_reduced_mod)
## 
## Call:
## glm(formula = TARGET_FLAG ~ . - AGE - HOMEKIDS - YOJ - INCOME - 
##     HOME_VAL - SEX - RED_CAR - CLM_FREQ - CAR_AGE - HSDropout - 
##     Professional - Blue_Collar - Clerical - Lawyer - Home_Maker - 
##     HOME_VAL_MOD - Student - Doctor, family = binomial(link = "logit"), 
##     data = train_flag)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.4861  -0.7200  -0.3998   0.6352   3.1436  
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   -2.293e+00  3.328e-01  -6.890 5.57e-12 ***
## KIDSDRIV       4.223e-01  5.501e-02   7.677 1.63e-14 ***
## PARENT1        4.533e-01  9.423e-02   4.810 1.51e-06 ***
## MSTATUS       -6.218e-01  6.876e-02  -9.042  < 2e-16 ***
## TRAVTIME       1.481e-02  1.882e-03   7.868 3.59e-15 ***
## CAR_USE        7.868e-01  7.175e-02  10.965  < 2e-16 ***
## BLUEBOOK       3.501e-05  1.892e-05   1.851  0.06420 .  
## TIF           -5.372e-02  7.335e-03  -7.324 2.40e-13 ***
## OLDCLAIM      -2.723e-05  4.673e-06  -5.826 5.68e-09 ***
## REVOKED        9.706e-01  9.257e-02  10.485  < 2e-16 ***
## MVR_PTS        9.784e-02  1.402e-02   6.978 3.00e-12 ***
## URBANICITY     2.365e+00  1.141e-01  20.736  < 2e-16 ***
## Bachelors     -4.712e-01  7.483e-02  -6.297 3.03e-10 ***
## Masters       -5.234e-01  8.887e-02  -5.890 3.87e-09 ***
## PhD           -6.617e-01  1.260e-01  -5.251 1.51e-07 ***
## Panel_Truck    4.290e-01  1.460e-01   2.939  0.00329 ** 
## Pickup         5.274e-01  9.814e-02   5.374 7.69e-08 ***
## Sports_Car     9.240e-01  1.071e-01   8.628  < 2e-16 ***
## Van            6.242e-01  1.194e-01   5.229 1.71e-07 ***
## SUV            6.976e-01  8.554e-02   8.155 3.50e-16 ***
## Manager       -7.115e-01  1.065e-01  -6.682 2.36e-11 ***
## INCOME_MOD    -5.309e-03  7.736e-04  -6.863 6.75e-12 ***
## BLUEBOOK_MOD  -2.195e-02  7.004e-03  -3.134  0.00172 ** 
## OLD_CLAIM_MOD  8.201e-02  9.799e-03   8.369  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 9418.0  on 8160  degrees of freedom
## Residual deviance: 7290.2  on 8137  degrees of freedom
## AIC: 7338.2
## 
## Number of Fisher Scoring iterations: 5

MODEL SELECTION:

For the binary logistic regression I select Model 5. Its AIC (7338.2) is essentially the same as the full Model 4's (7334.7), and its residual deviance (7290.2 on 8137 df) is close to Model 4's (7254.7 on 8121 df), while it uses far fewer predictors, all of them significant. The ROC curve for Model 5 below also looks good, so I will proceed with Model 5. For the multiple linear regression I will go with Model 2.
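As a quick numeric check on that choice, the two candidate logistic model objects fit above can be compared side by side. This is a small sketch of my own, not part of the original workflow; lower AIC is better.

#Compare the full model (Model 4) and the reduced model (Model 5)
AIC(flagfull_mod, flag_reduced_mod)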

train_flag$predict <- predict(flag_reduced_mod, train_flag, type='response')

roc_model5 <- roc(train_flag$TARGET_FLAG, train_flag$predict, plot=T, asp=NA,
                legacy.axes=T, main = "ROC Curve", col="blue")

roc_model5["auc"]
## $auc
## Area under the curve: 0.8146
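Before committing to the 0.50 cutoff used below, one could also ask pROC for the cutoff that maximizes Youden's J (sensitivity + specificity - 1) on the training data. This is a hypothetical check I am adding, not part of the original workflow:

#Threshold maximizing Youden's J, with its sensitivity and specificity
coords(roc_model5, "best", best.method = "youden")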

Now let's build the confusion matrix (rows are predicted classes, columns are actual classes):

train_flag$predict_target <- ifelse(train_flag$predict >=0.5, 1, 0)
train_flag$predict_target <- as.integer(train_flag$predict_target)
myvars <- c("TARGET_FLAG", "predict_target")
train_flag_cm <- train_flag[myvars]
cm <- table(train_flag_cm$predict_target,train_flag_cm$TARGET_FLAG)
knitr::kable(cm)
             actual 0   actual 1
pred 0           5560       1254
pred 1            448        899
Accuracy <- function(tb) {
#tb: confusion matrix with predicted classes in rows, actual classes in columns
TN=tb[1,1]
TP=tb[2,2]
FP=tb[2,1]
FN=tb[1,2]
return((TP+TN)/(TP+FP+TN+FN))
}
Accuracy(cm)
## [1] 0.7914471
CER <- function(tb) {
TN=tb[1,1]
TP=tb[2,2]
FP=tb[2,1]
FN=tb[1,2]
return((FP+FN)/(TP+FP+TN+FN))
}
CER(cm)
## [1] 0.2085529
Precision <- function(tb) {
TP=tb[2,2]
FP=tb[2,1]
return((TP)/(TP+FP))
}
Precision(cm)
## [1] 0.6674091
Sensitivity <- function(tb) {
TP=tb[2,2]
FN=tb[1,2]
return((TP)/(TP+FN))
}
Sensitivity(cm)
## [1] 0.4175569
Specificity <- function(tb) {
TN=tb[1,1]
FP=tb[2,1]
return((TN)/(TN+FP))
}
Specificity(cm)
## [1] 0.9254328
F1_score <- function(tb) {
TP=tb[2,2]
FP=tb[2,1]
FN=tb[1,2]
Precision = (TP)/(TP+FP)
Sensitivity = (TP)/(TP+FN)
return((2*Precision*Sensitivity)/(Precision+Sensitivity))
}
F1_score(cm)
## [1] 0.5137143
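The same numbers can be cross-checked in one call with caret (already loaded), treating "1" (crash) as the positive class; this is my own verification step, not part of the original workflow:

#caret reports accuracy, sensitivity, specificity, etc. in one summary
confusionMatrix(factor(train_flag_cm$predict_target, levels = c(0, 1)),
                factor(train_flag_cm$TARGET_FLAG, levels = c(0, 1)),
                positive = "1")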

TEST DATA PREPARATION AND TESTING THE MODEL ON EVALUATION DATA:

In the final step, we prepare the evaluation data exactly as we prepared the training data and then score it with the selected models.

ins_eval <- read.csv("https://raw.githubusercontent.com/Riteshlohiya/Data621-Assignment-4/master/insurance_evaluation_data.csv")


ins_eval$INCOME <- as.numeric(str_replace_all(ins_eval$INCOME, "[[:punct:]\\$]",""))
ins_eval$HOME_VAL <- as.numeric(str_replace_all(ins_eval$HOME_VAL, "[[:punct:]\\$]",""))
ins_eval$BLUEBOOK <- as.numeric(str_replace_all(ins_eval$BLUEBOOK, "[[:punct:]\\$]",""))
ins_eval$OLDCLAIM <- as.numeric(str_replace_all(ins_eval$OLDCLAIM, "[[:punct:]\\$]",""))


#Convert indicator variables to 0s and 1s; 1 = Yes for PARENT1/MSTATUS/RED_CAR/REVOKED, Male for SEX, Commercial for CAR_USE, and Highly Urban for URBANICITY
ins_eval$PARENT1 <- ifelse(ins_eval$PARENT1=="Yes", 1, 0)
ins_eval$MSTATUS <- ifelse(ins_eval$MSTATUS=="Yes", 1, 0)
ins_eval$SEX <- ifelse(ins_eval$SEX=="M", 1, 0)
ins_eval$CAR_USE <- ifelse(ins_eval$CAR_USE=="Commercial", 1, 0)
ins_eval$RED_CAR <- ifelse(ins_eval$RED_CAR=="Yes", 1, 0)
ins_eval$REVOKED <- ifelse(ins_eval$REVOKED=="Yes", 1, 0)
ins_eval$URBANICITY <- ifelse(ins_eval$URBANICITY == "Highly Urban/ Urban", 1, 0)

#Convert categorical predictor values to indicator variables - EDUCATION, CAR_TYPE, JOB

#EDUCATION, High school graduate is base case
ins_eval$HSDropout <- ifelse(ins_eval$EDUCATION=="<High School", 1, 0)
ins_eval$Bachelors <- ifelse(ins_eval$EDUCATION=="Bachelors", 1, 0)
ins_eval$Masters <- ifelse(ins_eval$EDUCATION=="Masters", 1, 0)
ins_eval$PhD <- ifelse(ins_eval$EDUCATION=="PhD", 1, 0)

#CAR_TYPE, base case is minivan
ins_eval$Panel_Truck <- ifelse(ins_eval$CAR_TYPE=="Panel Truck", 1, 0)
ins_eval$Pickup <- ifelse(ins_eval$CAR_TYPE=="Pickup", 1, 0)
ins_eval$Sports_Car <- ifelse(ins_eval$CAR_TYPE=="Sports Car", 1, 0)
ins_eval$Van <- ifelse(ins_eval$CAR_TYPE=="Van", 1, 0)
ins_eval$SUV <- ifelse(ins_eval$CAR_TYPE=="z_SUV", 1, 0)

#JOB, base case is the blank ("") category
ins_eval$Professional <- ifelse(ins_eval$JOB == "Professional", 1, 0)
ins_eval$Blue_Collar <- ifelse(ins_eval$JOB == "z_Blue Collar", 1, 0) #the data codes this level as "z_Blue Collar"
ins_eval$Clerical <- ifelse(ins_eval$JOB == "Clerical", 1, 0)
ins_eval$Doctor <- ifelse(ins_eval$JOB == "Doctor", 1, 0)
ins_eval$Lawyer <- ifelse(ins_eval$JOB == "Lawyer", 1, 0)
ins_eval$Manager <- ifelse(ins_eval$JOB == "Manager", 1, 0)
ins_eval$Home_Maker <- ifelse(ins_eval$JOB == "Home Maker", 1, 0)
ins_eval$Student <- ifelse(ins_eval$JOB == "Student", 1, 0)

ins_eval <- ins_eval %>% dplyr::select(-c(INDEX,EDUCATION,CAR_TYPE,JOB))

#Replace missing values in a column with that column's median
fillwithmedian <- function(x) {
  median_val = median(x, na.rm = TRUE)
  x[is.na(x)] = median_val
  return(x)
}

ins_eval <- data.frame(lapply(ins_eval, fillwithmedian))
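A quick check of my own that the fill worked: aside from the response columns, which are blank in the evaluation file, every predictor should now be complete.

#Count remaining NAs per column; predictors should all be zero
colSums(is.na(ins_eval))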


ins_eval$INCOME_MOD <- ins_eval$INCOME ^0.433
ins_eval$HOME_VAL_MOD <- ins_eval$HOME_VAL^0.113
ins_eval$BLUEBOOK_MOD <- ins_eval$BLUEBOOK^0.461
ins_eval$OLD_CLAIM_MOD <- log(ins_eval$OLDCLAIM + 1) 
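At this point the evaluation frame should carry every predictor the fitted models expect. A hedged sanity check, not in the original workflow, is to list any training columns still missing:

#Anything printed here (other than the prediction columns added to
#train_flag above) is a predictor the model will not find in ins_eval
setdiff(names(train_flag), names(ins_eval))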

ins_eval$predict_prob <- predict(flag_reduced_mod, ins_eval, type='response')
ins_eval$predict_target <- ifelse(ins_eval$predict_prob >= 0.50, 1,0)
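A quick look at the predicted class balance before writing the file out (my own check, not in the original workflow):

table(ins_eval$predict_target)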

write.csv(ins_eval,"Evaluation_Data.csv", row.names=FALSE)

ins_eval$TARGET_AMT1 <- 0

ins_eval1 <- filter(ins_eval, predict_target == 1)
ins_eval1$predict_target<-as.numeric(ins_eval1$predict_target)

ins_eval1$TARGET_AMT1 <- predict(amount_reduced_model2, newdata=ins_eval1)
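One optional safeguard, an assumption on my part rather than part of the original workflow: an ordinary linear model can return negative cost predictions, so the predicted amounts could be floored at zero before export.

#Clamp any negative predicted claim amounts to zero
ins_eval1$TARGET_AMT1 <- pmax(ins_eval1$TARGET_AMT1, 0)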

write.csv(ins_eval1,"Evaluation_Full_Data.csv", row.names=FALSE)