train_df <- read.csv("https://raw.githubusercontent.com/jconno/R-data/master/insurance_training_data.csv")
test_df <- read.csv("https://raw.githubusercontent.com/jconno/R-data/master/insurance-evaluation-data.csv")

DATA EXPLORATION

Data Summary

First, let’s correct the formats and values of the data.

train_df$INDEX <- NULL
train_df$INCOME <- as.numeric(gsub('[$,]', '', train_df$INCOME))
train_df$HOME_VAL <- as.numeric(gsub('[$,]', '', train_df$HOME_VAL))
train_df$BLUEBOOK <- as.numeric(gsub('[$,]', '', train_df$BLUEBOOK))
train_df$OLDCLAIM <- as.numeric(gsub('[$,]', '', train_df$OLDCLAIM))
train_df$PARENT1 <- gsub("z_", "", train_df$PARENT1)
train_df$MSTATUS <- gsub("z_", "", train_df$MSTATUS)
train_df$SEX <- gsub("z_", "", train_df$SEX)
train_df$EDUCATION <- gsub("z_", "", train_df$EDUCATION)
train_df$EDUCATION <- gsub("<", "Less Than", train_df$EDUCATION)
train_df$JOB <- gsub("z_", "", train_df$JOB)
train_df$CAR_TYPE <- gsub("z_", "", train_df$CAR_TYPE)
train_df$URBANICITY <- ifelse(train_df$URBANICITY=="Highly Urban/ Urban","Urban","Rural")
train_df$JOB[train_df$JOB == ""] <- NA
train_df[c("TARGET_FLAG","PARENT1","MSTATUS", "SEX", "EDUCATION", "JOB","CAR_TYPE",
           "RED_CAR", "URBANICITY", "CAR_USE","REVOKED")] <-             
                lapply(train_df[c("TARGET_FLAG","PARENT1","MSTATUS", "SEX", 
                                   "EDUCATION", "JOB","CAR_TYPE", "RED_CAR",
                                   "URBANICITY", "CAR_USE","REVOKED")], factor)

Below is a summary of the cleaned-up data.

summary(train_df)
##  TARGET_FLAG   TARGET_AMT        KIDSDRIV           AGE           HOMEKIDS     
##  0:6008      Min.   :     0   Min.   :0.0000   Min.   :16.00   Min.   :0.0000  
##  1:2153      1st Qu.:     0   1st Qu.:0.0000   1st Qu.:39.00   1st Qu.:0.0000  
##              Median :     0   Median :0.0000   Median :45.00   Median :0.0000  
##              Mean   :  1504   Mean   :0.1711   Mean   :44.79   Mean   :0.7212  
##              3rd Qu.:  1036   3rd Qu.:0.0000   3rd Qu.:51.00   3rd Qu.:1.0000  
##              Max.   :107586   Max.   :4.0000   Max.   :81.00   Max.   :5.0000  
##                                                NA's   :6                       
##       YOJ           INCOME       PARENT1       HOME_VAL      MSTATUS   
##  Min.   : 0.0   Min.   :     0   No :7084   Min.   :     0   No :3267  
##  1st Qu.: 9.0   1st Qu.: 28097   Yes:1077   1st Qu.:     0   Yes:4894  
##  Median :11.0   Median : 54028              Median :161160             
##  Mean   :10.5   Mean   : 61898              Mean   :154867             
##  3rd Qu.:13.0   3rd Qu.: 85986              3rd Qu.:238724             
##  Max.   :23.0   Max.   :367030              Max.   :885282             
##  NA's   :454    NA's   :445                 NA's   :464                
##  SEX                     EDUCATION              JOB          TRAVTIME     
##  F:4375   Bachelors           :2242   Blue Collar :1825   Min.   :  5.00  
##  M:3786   High School         :2330   Clerical    :1271   1st Qu.: 22.00  
##           Less ThanHigh School:1203   Professional:1117   Median : 33.00  
##           Masters             :1658   Manager     : 988   Mean   : 33.49  
##           PhD                 : 728   Lawyer      : 835   3rd Qu.: 44.00  
##                                       (Other)     :1599   Max.   :142.00  
##                                       NA's        : 526                   
##        CAR_USE        BLUEBOOK          TIF                CAR_TYPE   
##  Commercial:3029   Min.   : 1500   Min.   : 1.000   Minivan    :2145  
##  Private   :5132   1st Qu.: 9280   1st Qu.: 1.000   Panel Truck: 676  
##                    Median :14440   Median : 4.000   Pickup     :1389  
##                    Mean   :15710   Mean   : 5.351   Sports Car : 907  
##                    3rd Qu.:20850   3rd Qu.: 7.000   SUV        :2294  
##                    Max.   :69740   Max.   :25.000   Van        : 750  
##                                                                       
##  RED_CAR       OLDCLAIM        CLM_FREQ      REVOKED       MVR_PTS      
##  no :5783   Min.   :    0   Min.   :0.0000   No :7161   Min.   : 0.000  
##  yes:2378   1st Qu.:    0   1st Qu.:0.0000   Yes:1000   1st Qu.: 0.000  
##             Median :    0   Median :0.0000              Median : 1.000  
##             Mean   : 4037   Mean   :0.7986              Mean   : 1.696  
##             3rd Qu.: 4636   3rd Qu.:2.0000              3rd Qu.: 3.000  
##             Max.   :57037   Max.   :5.0000              Max.   :13.000  
##                                                                         
##     CAR_AGE       URBANICITY  
##  Min.   :-3.000   Rural:1669  
##  1st Qu.: 1.000   Urban:6492  
##  Median : 8.000               
##  Mean   : 8.328               
##  3rd Qu.:12.000               
##  Max.   :28.000               
##  NA's   :510

YOJ, INCOME, HOME_VAL, and CAR_AGE have many missing values; we will perform multiple imputation to fill them in.
CAR_AGE also contains an impossible value of -3, which we will likewise replace by imputation.
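
To quantify the missingness before imputing, we can count the NAs per column; a quick base-R check:

#count missing values per column and show only the affected columns
na_counts <- colSums(is.na(train_df))
na_counts[na_counts > 0]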

Box Plots

data.m <- melt(train_df[c("TARGET_FLAG","KIDSDRIV","AGE","HOMEKIDS","YOJ","INCOME",
                          "HOME_VAL", "TRAVTIME","BLUEBOOK","TIF","OLDCLAIM","CLM_FREQ", 
                          "MVR_PTS","CAR_AGE")], id.vars = 'TARGET_FLAG')
ggplot(data.m, aes(x = variable, y = value, fill = TARGET_FLAG)) + geom_boxplot() + 
  facet_wrap(~ variable, scales = 'free') + theme_classic()

The box plots show that many of the numeric variables are right-skewed; we will transform these variables to reduce the influence of outliers.

Distribution Plots

plot_KIDSDRIV <- ggplot(train_df, aes(x=KIDSDRIV, color=TARGET_FLAG)) + geom_density(na.rm =TRUE, bw=0.3)
plot_AGE <- ggplot(train_df, aes(x=AGE, color=TARGET_FLAG)) + geom_density(na.rm =TRUE)
plot_HOMEKIDS <- ggplot(train_df, aes(x=HOMEKIDS, color=TARGET_FLAG)) + geom_density(na.rm =TRUE, bw=0.4)
plot_YOJ <- ggplot(train_df, aes(x=YOJ, color=TARGET_FLAG)) + geom_density(na.rm =TRUE)
plot_INCOME <- ggplot(train_df, aes(x=INCOME, color=TARGET_FLAG)) + geom_density(na.rm =TRUE)
plot_HOME_VAL <- ggplot(train_df, aes(x=HOME_VAL, color=TARGET_FLAG)) + geom_density(na.rm =TRUE)
plot_TRAVTIME <- ggplot(train_df, aes(x=TRAVTIME, color=TARGET_FLAG)) + geom_density(na.rm =TRUE)
plot_BLUEBOOK <- ggplot(train_df, aes(x=BLUEBOOK, color=TARGET_FLAG)) + geom_density(na.rm =TRUE)
plot_TIF <- ggplot(train_df, aes(x=TIF, color=TARGET_FLAG)) + geom_density(na.rm =TRUE)
plot_OLDCLAIM <- ggplot(train_df, aes(x=OLDCLAIM, color=TARGET_FLAG)) + geom_density(na.rm =TRUE)
plot_CLM_FREQ <- ggplot(train_df, aes(x=CLM_FREQ, color=TARGET_FLAG)) + geom_density(na.rm =TRUE, bw=0.4)
plot_MVR_PTS <- ggplot(train_df, aes(x=MVR_PTS, color=TARGET_FLAG)) + geom_density(na.rm =TRUE, bw=0.4)
plots_CAR_AGE <- ggplot(train_df, aes(x=CAR_AGE, color=TARGET_FLAG)) + geom_density(na.rm =TRUE)
plot_KIDSDRIV+plot_AGE+plot_HOMEKIDS+plot_YOJ+plot_INCOME+plot_HOME_VAL+
  plot_TRAVTIME+plot_BLUEBOOK+plot_TIF+plot_OLDCLAIM+plot_CLM_FREQ+
  plot_MVR_PTS+plots_CAR_AGE+
  plot_layout(ncol = 4, guides = "collect")

Most of the distributions are similar for target = 0 and target = 1. OLDCLAIM and CLM_FREQ look like good candidates for predicting whether a crash occurs.

We can also look at the categorical variables:

plot_PARENT1 <- ggplot(train_df,aes(x=PARENT1,fill=TARGET_FLAG))+geom_bar(position = position_dodge())
plot_MSTATUS <- ggplot(train_df,aes(x=MSTATUS,fill=TARGET_FLAG))+geom_bar(position = position_dodge())
plot_SEX <- ggplot(train_df,aes(x=SEX,fill=TARGET_FLAG))+geom_bar(position = position_dodge())
plot_EDUCATION <- ggplot(train_df,aes(x=substring(train_df$EDUCATION,1,5),fill=TARGET_FLAG))+
                  geom_bar(position = position_dodge())+xlab("EDUCATION")
plot_JOB <- ggplot(train_df,aes(x=substring(train_df$JOB,1,2),fill=TARGET_FLAG))+
                  geom_bar(position = position_dodge())+xlab("JOB")
plot_CAR_TYPE <- ggplot(train_df,aes(x=substring(train_df$CAR_TYPE,1,4),fill=TARGET_FLAG))+
                  geom_bar(position = position_dodge())+xlab("CAR_TYPE")
plot_RED_CAR <- ggplot(train_df,aes(x=RED_CAR,fill=TARGET_FLAG))+geom_bar(position = position_dodge())
plot_URBANICITY <- ggplot(train_df,aes(x=URBANICITY,fill=TARGET_FLAG))+geom_bar(position = position_dodge())
plot_CAR_USE <- ggplot(train_df,aes(x=CAR_USE,fill=TARGET_FLAG))+geom_bar(position = position_dodge())
plot_REVOKED <- ggplot(train_df,aes(x=REVOKED,fill=TARGET_FLAG))+geom_bar(position = position_dodge())
plot_PARENT1+plot_MSTATUS+plot_SEX+plot_EDUCATION+plot_JOB+plot_CAR_TYPE+plot_RED_CAR+
  plot_URBANICITY+plot_CAR_USE+plot_REVOKED+plot_layout(ncol = 3, guides = "collect")

PARENT1, MSTATUS, URBANICITY, CAR_USE, and REVOKED show notable differences in distribution between target = 0 and target = 1.
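
These visual impressions can be checked numerically. As a minimal sketch, a chi-square test of independence between URBANICITY and TARGET_FLAG (the same pattern applies to the other factors):

#test the association between a categorical predictor and the target
chisq.test(table(train_df$URBANICITY, train_df$TARGET_FLAG))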

Correlations

Now let’s look at the correlations between the variables.

corrplot::corrplot(cor(train_df[c("KIDSDRIV","AGE","HOMEKIDS","YOJ","INCOME",
                                  "HOME_VAL","BLUEBOOK","TIF","OLDCLAIM", "CLM_FREQ",
                                  "MVR_PTS","CAR_AGE")], use = "na.or.complete"), 
                   method = 'number', type = 'lower', diag = FALSE, tl.srt = 0.1)

None of the variables are strongly correlated with one another, so there is no serious multicollinearity problem.
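
To back this reading of the plot with a number, we can extract the strongest pairwise correlation; a small sketch over the same numeric columns:

num_vars <- c("KIDSDRIV","AGE","HOMEKIDS","YOJ","INCOME","HOME_VAL",
              "BLUEBOOK","TIF","OLDCLAIM","CLM_FREQ","MVR_PTS","CAR_AGE")
cors <- cor(train_df[num_vars], use = "na.or.complete")
cors[upper.tri(cors, diag = TRUE)] <- NA  #keep each pair only once
max_abs <- max(abs(cors), na.rm = TRUE)
which(abs(cors) == max_abs, arr.ind = TRUE)  #the strongest pair
max_abs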

DATA PREPARATION

Data Imputation

#save the indicators of missing values; they will be used to verify
#the distributions of the imputed values
YOJ_NA <- is.na(train_df$YOJ)
INCOME_NA <- is.na(train_df$INCOME)
HOME_VAL_NA <- is.na(train_df$HOME_VAL)
CAR_AGE_NA <- is.na(train_df$CAR_AGE)
#remove incorrect CAR_AGE value for imputation
train_df$CAR_AGE[train_df$CAR_AGE < 0] <- NA
#temporarily exclude TARGET_FLAG and TARGET_AMT from our imputation
TARGET_FLAG <- train_df$TARGET_FLAG
TARGET_AMT <- train_df$TARGET_AMT
train_df$TARGET_FLAG <- NULL
train_df$TARGET_AMT <- NULL
#save the imputation models to impute the test data set later
mickey <- parlmice(train_df, maxit = 5, m = 1, printFlag = FALSE, seed = 2022, cluster.seed = 2022)
#save the imputation result
train_df <- complete(mickey,1)
#Add TARGET_FLAG and TARGET_AMT back to our dataframe
train_df$TARGET_FLAG <- TARGET_FLAG
train_df$TARGET_AMT <- TARGET_AMT
TARGET_FLAG <- NULL
TARGET_AMT <- NULL

We can compare the imputed values with the original data values.
The plots in the top row below show the distributions of the originally observed values.
The plots in the bottom row show the distributions of the imputed values.

plot_YOJ <- ggplot(train_df[!YOJ_NA,], aes(x=YOJ, color=TARGET_FLAG)) + geom_density(na.rm =TRUE)
plot_INCOME <- ggplot(train_df[!INCOME_NA,], aes(x=INCOME, color=TARGET_FLAG)) + geom_density(na.rm =TRUE)
plot_HOME_VAL <- ggplot(train_df[!HOME_VAL_NA,], aes(x=HOME_VAL, color=TARGET_FLAG)) + geom_density(na.rm =TRUE)
plot_CAR_AGE <- ggplot(train_df[!CAR_AGE_NA,], aes(x=CAR_AGE, color=TARGET_FLAG)) + geom_density(na.rm =TRUE)
plot_YOJ2 <- ggplot(train_df[YOJ_NA,], aes(x=YOJ, color=TARGET_FLAG)) + geom_density(na.rm =TRUE)
plot_INCOME2 <- ggplot(train_df[INCOME_NA,], aes(x=INCOME, color=TARGET_FLAG)) + geom_density(na.rm =TRUE)
plot_HOME_VAL2 <- ggplot(train_df[HOME_VAL_NA,], aes(x=HOME_VAL, color=TARGET_FLAG)) + geom_density(na.rm =TRUE)
plot_CAR_AGE2 <- ggplot(train_df[CAR_AGE_NA,], aes(x=CAR_AGE, color=TARGET_FLAG)) + geom_density(na.rm =TRUE)
plot_YOJ+plot_INCOME+plot_HOME_VAL+plot_CAR_AGE+
  plot_YOJ2+plot_INCOME2+plot_HOME_VAL2+plot_CAR_AGE2+
  plot_layout(ncol = 4, guides = "collect")

The distributions look similar, so the imputed values are plausible.
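
The mice package also ships a built-in diagnostic for the same purpose: given the mids object mickey created above, densityplot() overlays the observed and imputed densities directly.

#lattice-based overlay of observed vs. imputed densities
densityplot(mickey)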

Data Transformation

YOJ: The density plot shows the variable is zero-inflated, and the coefficient for YOJ = 0 may differ significantly from the coefficient for YOJ > 0. Therefore, we add a dummy variable indicating whether YOJ is 0.

HOME_VAL: This variable is also zero-inflated, so we add a dummy variable indicating whether the person owns a home.

INCOME: We add a dummy variable indicating whether the person has an income; practically, it is a key factor in insurance pricing.

OLDCLAIM: We add a dummy variable indicating whether the person had a previous claim, since the coefficient for OLDCLAIM = 0 may differ significantly from the coefficient for OLDCLAIM > 0.

INCOME, HOME_VAL, BLUEBOOK, OLDCLAIM: We log-transform all monetary variables, as they are right-skewed.
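
Before applying the transformations, a quick sanity check that the log transform does reduce the skew; skew() below is an ad-hoc moment-based helper, not from a package:

#third-moment skewness; INCOME has no NAs after imputation
skew <- function(x) mean((x - mean(x))^3) / sd(x)^3
c(before = skew(train_df$INCOME), after = skew(log(train_df$INCOME + 1)))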

train_df$YOJ_Y <- as.factor(ifelse(train_df$YOJ == 0,0,1))
train_df$INCOME_Y <- as.factor(ifelse(train_df$INCOME == 0,0,1))
train_df$HOME_VAL_Y <- as.factor(ifelse(train_df$HOME_VAL == 0,0,1))
train_df$OLDCLAIM_Y <- as.factor(ifelse(train_df$OLDCLAIM == 0,0,1))
train_df$INCOME_LOG <- log(train_df$INCOME+1)
train_df$HOME_VAL_LOG <- log(train_df$HOME_VAL+1)
train_df$BLUEBOOK_LOG <- log(train_df$BLUEBOOK)
train_df$OLDCLAIM_LOG <- log(train_df$OLDCLAIM+1)
train_df$INCOME <- NULL
train_df$HOME_VAL <- NULL
train_df$BLUEBOOK <- NULL
train_df$OLDCLAIM <- NULL
logistic_train_df <- train_df

Logistic Models

Preliminary model

First, let’s build a test model to see whether any additional transformations are needed to fit our logistic models.

test_model <- glm(TARGET_FLAG~.-TARGET_AMT, family = binomial, logistic_train_df)
marginalModelPlots(test_model,~KIDSDRIV + AGE + HOMEKIDS + YOJ  + BLUEBOOK_LOG +  
                    HOME_VAL_LOG + TRAVTIME+ INCOME_LOG + TIF + OLDCLAIM_LOG +
                     CLM_FREQ + MVR_PTS + CAR_AGE, layout =c(4,4))

Additional Transformations

Additional transformations are needed for KIDSDRIV, AGE, HOMEKIDS, TRAVTIME, and MVR_PTS.

From the density plots above, we see that AGE is approximately normal for both target = 0 and target = 1. According to the textbook A Modern Approach to Regression with R, if the variance of such a predictor differs between the two response values, a squared term should be added.

data.frame(Variance_of_AGE_TARGET0=c(var(logistic_train_df$AGE[logistic_train_df$TARGET_FLAG==0])),
           Variance_of_AGE_TARGET1=c(var(logistic_train_df$AGE[logistic_train_df$TARGET_FLAG==1])))
var.test(AGE ~ TARGET_FLAG, logistic_train_df, alternative = "two.sided")
## 
##  F test to compare two variances
## 
## data:  AGE by TARGET_FLAG
## F = 0.73556, num df = 6007, denom df = 2152, p-value < 2.2e-16
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
##  0.6856687 0.7881551
## sample estimates:
## ratio of variances 
##          0.7355552

The variances are clearly different, so we will add a squared term and check whether it improves the fit.

logistic_train_df$AGE_Squared <- logistic_train_df$AGE^2

For KIDSDRIV, HOMEKIDS, TRAVTIME, and MVR_PTS, power transformations are used, with the powers determined by trial and error.
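
As a minimal sketch of what that trial and error could look like for one variable, we can compare single-predictor AICs over an arbitrary grid of candidate powers:

#compare the AIC of one-predictor logistic fits across candidate powers
powers <- c(0.25, 0.33, 0.5, 1, 2, 3)
setNames(sapply(powers, function(p)
  AIC(glm(TARGET_FLAG ~ I(TRAVTIME^p), family = binomial,
          data = logistic_train_df))), powers)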

logistic_train_df$KIDSDRIV_0.5 <- (logistic_train_df$KIDSDRIV)^0.5
logistic_train_df$HOMEKIDS_0.5 <- (logistic_train_df$HOMEKIDS)^0.5
logistic_train_df$MVR_PTS_3 <- (logistic_train_df$MVR_PTS)^3
logistic_train_df$TRAVTIME_0.33 <- (logistic_train_df$TRAVTIME)^0.33
logistic_train_df$KIDSDRIV <- NULL
logistic_train_df$HOMEKIDS <- NULL
logistic_train_df$MVR_PTS <- NULL
logistic_train_df$TRAVTIME <- NULL

After all the transformations, the test model now fits our data well.

test_model <- glm(TARGET_FLAG~.-TARGET_AMT, family = binomial, logistic_train_df)
marginalModelPlots(test_model,~KIDSDRIV_0.5 + AGE + HOMEKIDS_0.5 + YOJ  + BLUEBOOK_LOG +  
                    HOME_VAL_LOG + TRAVTIME_0.33 + INCOME_LOG + TIF + OLDCLAIM_LOG +
                     CLM_FREQ + MVR_PTS_3 + CAR_AGE, layout =c(4,4))

Building Models

Full Model

First, we build a full model with all predictors.

logi_full <- glm(TARGET_FLAG~.-TARGET_AMT, family = binomial, logistic_train_df)
summary(logi_full)
## 
## Call:
## glm(formula = TARGET_FLAG ~ . - TARGET_AMT, family = binomial, 
##     data = logistic_train_df)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.6561  -0.7040  -0.3916   0.5906   3.0517  
## 
## Coefficients:
##                                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                    4.0329974  0.8904557   4.529 5.92e-06 ***
## AGE                           -0.2054268  0.0261087  -7.868 3.60e-15 ***
## YOJ                            0.0094186  0.0121940   0.772 0.439882    
## PARENT1Yes                     0.2551747  0.1177500   2.167 0.030228 *  
## MSTATUSYes                    -0.5506845  0.0907463  -6.068 1.29e-09 ***
## SEXM                           0.0142482  0.1087067   0.131 0.895720    
## EDUCATIONHigh School           0.4277573  0.0915449   4.673 2.97e-06 ***
## EDUCATIONLess ThanHigh School  0.3945122  0.1189492   3.317 0.000911 ***
## EDUCATIONMasters               0.0741044  0.1271528   0.583 0.560029    
## EDUCATIONPhD                   0.2959814  0.1703770   1.737 0.082349 .  
## JOBClerical                    0.0764184  0.1074750   0.711 0.477063    
## JOBDoctor                     -1.0689510  0.2486535  -4.299 1.72e-05 ***
## JOBHome Maker                 -0.3877938  0.1773726  -2.186 0.028792 *  
## JOBLawyer                     -0.1810184  0.1681735  -1.076 0.281758    
## JOBManager                    -0.8139390  0.1312582  -6.201 5.61e-10 ***
## JOBProfessional               -0.1443933  0.1171210  -1.233 0.217629    
## JOBStudent                    -0.4648306  0.1679583  -2.768 0.005648 ** 
## CAR_USEPrivate                -0.7841910  0.0895536  -8.757  < 2e-16 ***
## TIF                           -0.0558025  0.0073917  -7.549 4.37e-14 ***
## CAR_TYPEPanel Truck            0.5798982  0.1507360   3.847 0.000120 ***
## CAR_TYPEPickup                 0.5938969  0.1011924   5.869 4.38e-09 ***
## CAR_TYPESports Car             0.8766221  0.1293159   6.779 1.21e-11 ***
## CAR_TYPESUV                    0.7113846  0.1078339   6.597 4.19e-11 ***
## CAR_TYPEVan                    0.6913626  0.1261029   5.483 4.19e-08 ***
## RED_CARyes                    -0.0529047  0.0874085  -0.605 0.545008    
## CLM_FREQ                       0.0512862  0.0448578   1.143 0.252912    
## REVOKEDYes                     0.8619659  0.0892525   9.658  < 2e-16 ***
## CAR_AGE                       -0.0049525  0.0076018  -0.651 0.514735    
## URBANICITYUrban                2.3670806  0.1134796  20.859  < 2e-16 ***
## YOJ_Y1                        -0.1591415  0.3094673  -0.514 0.607082    
## INCOME_Y1                      0.3383702  0.5821690   0.581 0.561090    
## HOME_VAL_Y1                    2.7303698  1.3884451   1.966 0.049242 *  
## OLDCLAIM_Y1                    1.7604446  0.4177531   4.214 2.51e-05 ***
## INCOME_LOG                    -0.0947646  0.0552415  -1.715 0.086261 .  
## HOME_VAL_LOG                  -0.2489162  0.1144621  -2.175 0.029656 *  
## BLUEBOOK_LOG                  -0.3311012  0.0596444  -5.551 2.84e-08 ***
## OLDCLAIM_LOG                  -0.1597333  0.0459161  -3.479 0.000504 ***
## AGE_Squared                    0.0022731  0.0002855   7.962 1.69e-15 ***
## KIDSDRIV_0.5                   0.6879048  0.0860413   7.995 1.30e-15 ***
## HOMEKIDS_0.5                   0.0106102  0.0694718   0.153 0.878613    
## MVR_PTS_3                      0.0016706  0.0002621   6.373 1.85e-10 ***
## TRAVTIME_0.33                  0.4424007  0.0556728   7.946 1.92e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 9418.0  on 8160  degrees of freedom
## Residual deviance: 7195.2  on 8119  degrees of freedom
## AIC: 7279.2
## 
## Number of Fisher Scoring iterations: 5

Backward Elimination by AIC

Starting with our full model, we perform backward elimination by comparing the AIC of the candidate models.

logi_AIC <- step(logi_full,trace=0)
summary(logi_AIC)
## 
## Call:
## glm(formula = TARGET_FLAG ~ AGE + PARENT1 + MSTATUS + EDUCATION + 
##     JOB + CAR_USE + TIF + CAR_TYPE + REVOKED + URBANICITY + HOME_VAL_Y + 
##     OLDCLAIM_Y + INCOME_LOG + HOME_VAL_LOG + BLUEBOOK_LOG + OLDCLAIM_LOG + 
##     AGE_Squared + KIDSDRIV_0.5 + MVR_PTS_3 + TRAVTIME_0.33, family = binomial, 
##     data = logistic_train_df)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.6611  -0.7024  -0.3907   0.5939   3.0425  
## 
## Coefficients:
##                                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                    3.9489782  0.8090148   4.881 1.05e-06 ***
## AGE                           -0.2075123  0.0251120  -8.263  < 2e-16 ***
## PARENT1Yes                     0.2749929  0.1016652   2.705 0.006833 ** 
## MSTATUSYes                    -0.5318116  0.0858020  -6.198 5.71e-10 ***
## EDUCATIONHigh School           0.4573110  0.0842096   5.431 5.62e-08 ***
## EDUCATIONLess ThanHigh School  0.4355342  0.1092115   3.988 6.66e-05 ***
## EDUCATIONMasters               0.0499713  0.1207856   0.414 0.679080    
## EDUCATIONPhD                   0.2709381  0.1658346   1.634 0.102304    
## JOBClerical                    0.0864041  0.1061130   0.814 0.415493    
## JOBDoctor                     -1.0766600  0.2484480  -4.334 1.47e-05 ***
## JOBHome Maker                 -0.3531558  0.1677204  -2.106 0.035237 *  
## JOBLawyer                     -0.1863971  0.1681038  -1.109 0.267507    
## JOBManager                    -0.8162009  0.1311596  -6.223 4.88e-10 ***
## JOBProfessional               -0.1469552  0.1170122  -1.256 0.209153    
## JOBStudent                    -0.4183337  0.1484795  -2.817 0.004841 ** 
## CAR_USEPrivate                -0.7845867  0.0895026  -8.766  < 2e-16 ***
## TIF                           -0.0556795  0.0073869  -7.538 4.79e-14 ***
## CAR_TYPEPanel Truck            0.5696467  0.1443959   3.945 7.98e-05 ***
## CAR_TYPEPickup                 0.5916173  0.1011371   5.850 4.93e-09 ***
## CAR_TYPESports Car             0.8893983  0.1091419   8.149 3.67e-16 ***
## CAR_TYPESUV                    0.7232583  0.0864527   8.366  < 2e-16 ***
## CAR_TYPEVan                    0.6839796  0.1226482   5.577 2.45e-08 ***
## REVOKEDYes                     0.8608141  0.0891811   9.652  < 2e-16 ***
## URBANICITYUrban                2.3653811  0.1133968  20.859  < 2e-16 ***
## HOME_VAL_Y1                    2.9662970  1.2829635   2.312 0.020774 *  
## OLDCLAIM_Y1                    1.8632701  0.4063386   4.586 4.53e-06 ***
## INCOME_LOG                    -0.0646728  0.0144620  -4.472 7.75e-06 ***
## HOME_VAL_LOG                  -0.2682673  0.1056822  -2.538 0.011135 *  
## BLUEBOOK_LOG                  -0.3303493  0.0556539  -5.936 2.92e-09 ***
## OLDCLAIM_LOG                  -0.1592683  0.0458819  -3.471 0.000518 ***
## AGE_Squared                    0.0022997  0.0002771   8.299  < 2e-16 ***
## KIDSDRIV_0.5                   0.6997619  0.0748697   9.346  < 2e-16 ***
## MVR_PTS_3                      0.0016625  0.0002616   6.355 2.09e-10 ***
## TRAVTIME_0.33                  0.4435298  0.0556466   7.970 1.58e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 9418.0  on 8160  degrees of freedom
## Residual deviance: 7198.4  on 8127  degrees of freedom
## AIC: 7266.4
## 
## Number of Fisher Scoring iterations: 5

Backward Elimination by BIC

Starting with our full model, we perform backward elimination by comparing the BIC of the candidate models.

logi_BIC <- step(logi_full,trace=0,k=log(nrow(logistic_train_df)))
summary(logi_BIC)
## 
## Call:
## glm(formula = TARGET_FLAG ~ AGE + MSTATUS + JOB + CAR_USE + TIF + 
##     CAR_TYPE + REVOKED + CAR_AGE + URBANICITY + OLDCLAIM_Y + 
##     INCOME_LOG + HOME_VAL_LOG + BLUEBOOK_LOG + OLDCLAIM_LOG + 
##     AGE_Squared + KIDSDRIV_0.5 + MVR_PTS_3 + TRAVTIME_0.33, family = binomial, 
##     data = logistic_train_df)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.7053  -0.7048  -0.3991   0.5995   3.0181  
## 
## Coefficients:
##                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)          5.1562404  0.7823239   6.591 4.37e-11 ***
## AGE                 -0.2229706  0.0245599  -9.079  < 2e-16 ***
## MSTATUSYes          -0.6094354  0.0766188  -7.954 1.80e-15 ***
## JOBClerical          0.1589964  0.1044652   1.522 0.128008    
## JOBDoctor           -1.0214492  0.1942875  -5.257 1.46e-07 ***
## JOBHome Maker       -0.3433937  0.1591796  -2.157 0.030984 *  
## JOBLawyer           -0.3053249  0.1308346  -2.334 0.019613 *  
## JOBManager          -0.9350503  0.1166132  -8.018 1.07e-15 ***
## JOBProfessional     -0.3264372  0.1083441  -3.013 0.002587 ** 
## JOBStudent          -0.4415347  0.1482545  -2.978 0.002899 ** 
## CAR_USEPrivate      -0.7524262  0.0849223  -8.860  < 2e-16 ***
## TIF                 -0.0552870  0.0073589  -7.513 5.78e-14 ***
## CAR_TYPEPanel Truck  0.5874722  0.1408720   4.170 3.04e-05 ***
## CAR_TYPEPickup       0.6142354  0.0999528   6.145 7.98e-10 ***
## CAR_TYPESports Car   0.8880994  0.1087609   8.166 3.20e-16 ***
## CAR_TYPESUV          0.7257516  0.0861241   8.427  < 2e-16 ***
## CAR_TYPEVan          0.6678454  0.1214282   5.500 3.80e-08 ***
## REVOKEDYes           0.8572408  0.0888624   9.647  < 2e-16 ***
## CAR_AGE             -0.0197342  0.0063285  -3.118 0.001819 ** 
## URBANICITYUrban      2.3460506  0.1129775  20.766  < 2e-16 ***
## OLDCLAIM_Y1          1.8478524  0.4046282   4.567 4.95e-06 ***
## INCOME_LOG          -0.0759929  0.0141159  -5.383 7.31e-08 ***
## HOME_VAL_LOG        -0.0255439  0.0069727  -3.663 0.000249 ***
## BLUEBOOK_LOG        -0.3428736  0.0553806  -6.191 5.97e-10 ***
## OLDCLAIM_LOG        -0.1566729  0.0457065  -3.428 0.000608 ***
## AGE_Squared          0.0024327  0.0002732   8.906  < 2e-16 ***
## KIDSDRIV_0.5         0.7532340  0.0712011  10.579  < 2e-16 ***
## MVR_PTS_3            0.0016393  0.0002613   6.273 3.54e-10 ***
## TRAVTIME_0.33        0.4280523  0.0553681   7.731 1.07e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 9418.0  on 8160  degrees of freedom
## Residual deviance: 7238.2  on 8132  degrees of freedom
## AIC: 7296.2
## 
## Number of Fisher Scoring iterations: 5

Backward Elimination with Chi-square test

Starting with our full model, we perform backward elimination using the Chi-square test.

#Define a function to perform backward elimination with the Chi-square test,
#taking the significance level (alpha) as one of the parameters
backward_chi <- function (train_df, significance) {
  glm_string <- "TARGET_FLAG~.-TARGET_AMT"
  glm_formula <- as.formula(glm_string)
  
  repeat{
    #test every predictor currently in the model
    drop1_chi <- drop1(glm(glm_formula, family=binomial, train_df), test="Chi")
  
    chi_result <- data.frame(predictors = rownames(drop1_chi)[-1],
             p_value = drop1_chi[-1,5])
    chi_result <- chi_result[order(chi_result$p_value,decreasing=TRUE),]
    
    #stop once every remaining predictor is significant;
    #otherwise drop the least significant one and refit
    if(chi_result[1,2] < significance){
        break
    }
    else {
        glm_string <- paste0(glm_string,"-",chi_result[1,1])
        glm_formula <- as.formula(glm_string)
    }
  }
  return(glm_formula)
}

Model with alpha = 0.001 (based on Chi-square test)

logi_chi_0.001 <- backward_chi(logistic_train_df, 0.001)
logi_chi_0.001 <- glm(logi_chi_0.001, family=binomial, logistic_train_df) 
summary(logi_chi_0.001)
## 
## Call:
## glm(formula = logi_chi_0.001, family = binomial, data = logistic_train_df)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.6191  -0.7019  -0.3967   0.5973   3.0517  
## 
## Coefficients:
##                                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                    4.5472383  0.7887741   5.765 8.17e-09 ***
## AGE                           -0.2230854  0.0246142  -9.063  < 2e-16 ***
## MSTATUSYes                    -0.6344301  0.0769689  -8.243  < 2e-16 ***
## EDUCATIONHigh School           0.4774188  0.0836198   5.709 1.13e-08 ***
## EDUCATIONLess ThanHigh School  0.4724104  0.1078563   4.380 1.19e-05 ***
## EDUCATIONMasters               0.0278310  0.1203973   0.231 0.817191    
## EDUCATIONPhD                   0.2189363  0.1641822   1.333 0.182369    
## JOBClerical                    0.1268681  0.1049032   1.209 0.226516    
## JOBDoctor                     -1.0767104  0.2475024  -4.350 1.36e-05 ***
## JOBHome Maker                 -0.2760199  0.1638944  -1.684 0.092156 .  
## JOBLawyer                     -0.1755763  0.1678096  -1.046 0.295431    
## JOBManager                    -0.8119527  0.1309258  -6.202 5.59e-10 ***
## JOBProfessional               -0.1535110  0.1167957  -1.314 0.188727    
## JOBStudent                    -0.4405995  0.1485985  -2.965 0.003026 ** 
## CAR_USEPrivate                -0.7862542  0.0894799  -8.787  < 2e-16 ***
## TIF                           -0.0557758  0.0073789  -7.559 4.07e-14 ***
## CAR_TYPEPanel Truck            0.5451750  0.1440513   3.785 0.000154 ***
## CAR_TYPEPickup                 0.5924988  0.1010796   5.862 4.58e-09 ***
## CAR_TYPESports Car             0.8956242  0.1090053   8.216  < 2e-16 ***
## CAR_TYPESUV                    0.7260057  0.0863553   8.407  < 2e-16 ***
## CAR_TYPEVan                    0.6633982  0.1223797   5.421 5.93e-08 ***
## REVOKEDYes                     0.8598997  0.0890678   9.654  < 2e-16 ***
## URBANICITYUrban                2.3638832  0.1132547  20.872  < 2e-16 ***
## OLDCLAIM_Y1                    1.8730582  0.4061688   4.612 4.00e-06 ***
## INCOME_LOG                    -0.0720505  0.0142046  -5.072 3.93e-07 ***
## HOME_VAL_LOG                  -0.0242285  0.0069886  -3.467 0.000527 ***
## BLUEBOOK_LOG                  -0.3316303  0.0555233  -5.973 2.33e-09 ***
## OLDCLAIM_LOG                  -0.1600205  0.0458666  -3.489 0.000485 ***
## AGE_Squared                    0.0024332  0.0002737   8.889  < 2e-16 ***
## KIDSDRIV_0.5                   0.7563153  0.0714373  10.587  < 2e-16 ***
## MVR_PTS_3                      0.0016612  0.0002616   6.351 2.14e-10 ***
## TRAVTIME_0.33                  0.4373746  0.0555511   7.873 3.45e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 9418.0  on 8160  degrees of freedom
## Residual deviance: 7210.3  on 8129  degrees of freedom
## AIC: 7274.3
## 
## Number of Fisher Scoring iterations: 5

Model Selection

Since the data is imbalanced, we will not use the default threshold of 0.5 for our model predictions.

In business, we don’t want to misclassify a high-risk person as low risk, but we also don’t want to lose customers by charging low-risk people a high-risk rate. In practice, a cost matrix should be used to determine the classification threshold. Since we don’t know the costs here, we weight Sensitivity and Specificity equally and find the optimal threshold that maximizes their sum.

logi_models <- data.frame(model=c(""),DF=c(0),AIC=c(0.0000),AUC=c(0.0000),
                          Optimal_Threshold=c(0.0000),Sensitivity=c(0.0000),
                          Specificity=c(0.0000),Sum_Sens_Spec=c(0.0000))
models <- list(logi_full, logi_AIC, logi_BIC, logi_chi_0.001)
model_names <- c("logi_full", "logi_AIC", "logi_BIC", "logi_chi_0.001")
for (i in c(1:length(models))) {
    logi_models[i,"model"] <- model_names[i]
    logi_models[i,"DF"] <- models[[i]]$df.residual
    logi_models[i,"AIC"] <- round(models[[i]]$aic,4)
    rocCurve <- roc(logistic_train_df$TARGET_FLAG, models[[i]]$fitted.values)
    logi_models[i,"AUC"] <- round(rocCurve$auc,4)
    roc_df <- data.frame(Sensitivity = rocCurve$sensitivities, 
                         Specificity = rocCurve$specificities,
                         Sum_Sens_Spec = rocCurve$sensitivities+rocCurve$specificities,
                         Thresholds = rocCurve$thresholds)
    roc_df <- roc_df[which.max(roc_df$Sum_Sens_Spec),]
    logi_models[i,"Optimal_Threhold"] <- roc_df$Thresholds
    logi_models[i,"Sensitivity"] <- roc_df$Sensitivity
    logi_models[i,"Specificity"] <- roc_df$Specificity
    logi_models[i,"Sum_Sens_Spec"] <- roc_df$Sum_Sens_Spec
}
logi_models

By comparing the AUC and the sum of Sensitivity and Specificity, the best model is logi_AIC. Its performance is very close to that of the full model, but the full model should not be selected since it is less parsimonious.

The following is the ROC curve of the logi_AIC model:

rocCurve <- roc(logistic_train_df$TARGET_FLAG, logi_AIC$fitted.values)
plot(rocCurve)
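
As a cross-check on the threshold search above, pROC can return the Youden-optimal point directly from the same roc object:

#threshold maximizing Sensitivity + Specificity (Youden's J)
coords(rocCurve, "best", best.method = "youden")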

The following is the confusion matrix of the logi_AIC model:

predicted_class <- ifelse(logi_AIC$fitted.values>logi_models[2,"Optimal_Threshold"],1,0)
confusion_matrix <- confusionMatrix(as.factor(predicted_class),
                                      as.factor(train_df$TARGET_FLAG),positive = "1")
confusion_matrix
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 4617  591
##          1 1391 1562
##                                           
##                Accuracy : 0.7571          
##                  95% CI : (0.7477, 0.7664)
##     No Information Rate : 0.7362          
##     P-Value [Acc > NIR] : 7.855e-06       
##                                           
##                   Kappa : 0.4414          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.7255          
##             Specificity : 0.7685          
##          Pos Pred Value : 0.5290          
##          Neg Pred Value : 0.8865          
##              Prevalence : 0.2638          
##          Detection Rate : 0.1914          
##    Detection Prevalence : 0.3618          
##       Balanced Accuracy : 0.7470          
##                                           
##        'Positive' Class : 1               
## 
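
Had claim costs been known, the threshold could instead be chosen to minimize expected cost, as mentioned earlier. A minimal sketch with made-up costs (cost_fn and cost_fp are hypothetical numbers, not from the data):

cost_fn <- 5  #hypothetical cost of missing a high-risk driver
cost_fp <- 1  #hypothetical cost of overcharging a low-risk driver
P <- sum(logistic_train_df$TARGET_FLAG == "1")
N <- sum(logistic_train_df$TARGET_FLAG == "0")
exp_cost <- (1 - rocCurve$sensitivities) * P * cost_fn +
            (1 - rocCurve$specificities) * N * cost_fp
rocCurve$thresholds[which.min(exp_cost)]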

The marginal model plots below show no lack of fit in our model.

marginalModelPlots(logi_AIC,~KIDSDRIV_0.5 + AGE + BLUEBOOK_LOG + INCOME_LOG +
                    HOME_VAL_LOG + TRAVTIME_0.33 + TIF + OLDCLAIM_LOG + CLM_FREQ +
                     MVR_PTS_3 + CAR_AGE, layout =c(4,3))

The binned residual plot below also shows that the Pearson residuals are independent with approximately constant variance, apart from a few outliers.

arm::binnedplot(x = predict(logi_AIC, type="link"), y = residuals(logi_AIC, type="pearson"),                
                nclass = NULL, 
                xlab = "Mean of Link Function Value", 
                ylab = "Average Pearson residual", 
                main = "Binned residual plot", 
                cex.pts = 0.8, 
                col.pts = 1, 
                col.int = "gray")

We conclude that our optimal logistic model, logi_AIC, is valid.

Linear Model

Preliminary model

First, let’s build a test model to check whether any additional transformations are needed to build a valid model.

lm_train_df <- train_df[train_df$TARGET_FLAG==1,]
lm_train_df$TARGET_FLAG <- NULL
lm_full <- lm(TARGET_AMT~., lm_train_df)
summary(lm_full)
## 
## Call:
## lm(formula = TARGET_AMT ~ ., data = lm_train_df)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -8154  -3205  -1492    471  99479 
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                   -8294.8912  3513.3996  -2.361   0.0183 *  
## KIDSDRIV                       -234.3833   318.5517  -0.736   0.4619    
## AGE                              23.1846    21.8577   1.061   0.2889    
## HOMEKIDS                        239.4402   214.3595   1.117   0.2641    
## YOJ                               7.3693    72.6747   0.101   0.9192    
## PARENT1Yes                      310.6962   589.3268   0.527   0.5981    
## MSTATUSYes                     -843.3218   519.5911  -1.623   0.1047    
## SEXM                           1053.5283   633.5866   1.663   0.0965 .  
## EDUCATIONHigh School           -735.9043   515.6747  -1.427   0.1537    
## EDUCATIONLess ThanHigh School  -355.4430   653.0230  -0.544   0.5863    
## EDUCATIONMasters               1294.4823   796.1486   1.626   0.1041    
## EDUCATIONPhD                   2765.8499  1087.1973   2.544   0.0110 *  
## JOBClerical                    -233.1716   586.7995  -0.397   0.6911    
## JOBDoctor                     -3377.7663  1603.4557  -2.107   0.0353 *  
## JOBHome Maker                  -661.8363  1019.7723  -0.649   0.5164    
## JOBLawyer                     -1240.3331  1027.6515  -1.207   0.2276    
## JOBManager                    -1625.7257   835.6109  -1.946   0.0518 .  
## JOBProfessional                 262.4070   665.7051   0.394   0.6935    
## JOBStudent                     -192.8816   919.8533  -0.210   0.8339    
## TRAVTIME                         -0.4624    11.0895  -0.042   0.9667    
## CAR_USEPrivate                 -269.3929   509.2279  -0.529   0.5968    
## TIF                             -13.6367    42.5485  -0.320   0.7486    
## CAR_TYPEPanel Truck              88.8685   874.5999   0.102   0.9191    
## CAR_TYPEPickup                  -78.1660   596.7722  -0.131   0.8958    
## CAR_TYPESports Car              886.1392   735.9876   1.204   0.2287    
## CAR_TYPESUV                     591.4067   644.5484   0.918   0.3590    
## CAR_TYPEVan                     190.3336   759.2246   0.251   0.8021    
## RED_CARyes                     -159.9960   497.8451  -0.321   0.7480    
## CLM_FREQ                        -74.2320   237.5360  -0.313   0.7547    
## REVOKEDYes                     -889.8444   492.3692  -1.807   0.0709 .  
## MVR_PTS                         123.6897    70.2285   1.761   0.0783 .  
## CAR_AGE                         -98.0778    44.0188  -2.228   0.0260 *  
## URBANICITYUrban                  45.8823   757.2989   0.061   0.9517    
## YOJ_Y1                          556.5369  1636.0747   0.340   0.7338    
## INCOME_Y1                      1733.0355  3175.0021   0.546   0.5852    
## HOME_VAL_Y1                   -3132.6728  7868.7164  -0.398   0.6906    
## OLDCLAIM_Y1                    -962.1329  2438.8183  -0.395   0.6932    
## INCOME_LOG                     -222.2419   307.8439  -0.722   0.4704    
## HOME_VAL_LOG                    302.4478   652.8178   0.463   0.6432    
## BLUEBOOK_LOG                   1420.8187   329.4618   4.313 1.69e-05 ***
## OLDCLAIM_LOG                    118.4161   268.2680   0.441   0.6590    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7691 on 2112 degrees of freedom
## Multiple R-squared:  0.03168,    Adjusted R-squared:  0.01334 
## F-statistic: 1.727 on 40 and 2112 DF,  p-value: 0.003256
par(mfrow=c(2,2))
plot(lm_full)

Additional Transformation

The plots show that there is a non-linear relationship between the response variable and the predictors.

Let’s see what transformation Box-Cox would suggest for the response variable.

bc <- boxcox(lm_full)

lambda <- bc$x[which.max(bc$y)]
lambda
## [1] 0.02020202

The result, λ ≈ 0.02, is essentially 0; since the Box-Cox transform (y^λ − 1)/λ approaches log(y) as λ → 0, a log transformation is appropriate.

lm_train_df$TARGET_AMT_LOG <- log(lm_train_df$TARGET_AMT)
lm_train_df$TARGET_AMT <- NULL

Building Models

Full Model

First, we build a full model with all predictors.

lm_full <- lm(TARGET_AMT_LOG~., lm_train_df)
summary(lm_full)
## 
## Call:
## lm(formula = TARGET_AMT_LOG ~ ., data = lm_train_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.6771 -0.4051  0.0395  0.4046  3.3205 
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    6.3579773  0.3684984  17.254  < 2e-16 ***
## KIDSDRIV                      -0.0408035  0.0334109  -1.221   0.2221    
## AGE                            0.0024703  0.0022925   1.078   0.2814    
## HOMEKIDS                       0.0291498  0.0224828   1.297   0.1949    
## YOJ                           -0.0083886  0.0076224  -1.101   0.2712    
## PARENT1Yes                     0.0311088  0.0618108   0.503   0.6148    
## MSTATUSYes                    -0.0913721  0.0544966  -1.677   0.0938 .  
## SEXM                           0.0900364  0.0664529   1.355   0.1756    
## EDUCATIONHigh School           0.0435382  0.0540859   0.805   0.4209    
## EDUCATIONLess ThanHigh School  0.0373117  0.0684915   0.545   0.5860    
## EDUCATIONMasters               0.1936436  0.0835030   2.319   0.0205 *  
## EDUCATIONPhD                   0.2939906  0.1140293   2.578   0.0100 ** 
## JOBClerical                   -0.0014639  0.0615457  -0.024   0.9810    
## JOBDoctor                     -0.1980688  0.1681764  -1.178   0.2390    
## JOBHome Maker                 -0.0815360  0.1069575  -0.762   0.4460    
## JOBLawyer                     -0.1529216  0.1077839  -1.419   0.1561    
## JOBManager                    -0.0763891  0.0876420  -0.872   0.3835    
## JOBProfessional                0.0274579  0.0698216   0.393   0.6942    
## JOBStudent                     0.0297514  0.0964776   0.308   0.7578    
## TRAVTIME                      -0.0003521  0.0011631  -0.303   0.7621    
## CAR_USEPrivate                 0.0066168  0.0534097   0.124   0.9014    
## TIF                           -0.0018637  0.0044626  -0.418   0.6763    
## CAR_TYPEPanel Truck            0.0208568  0.0917313   0.227   0.8202    
## CAR_TYPEPickup                 0.0281819  0.0625917   0.450   0.6526    
## CAR_TYPESports Car             0.0704750  0.0771931   0.913   0.3614    
## CAR_TYPESUV                    0.0878397  0.0676026   1.299   0.1940    
## CAR_TYPEVan                   -0.0263177  0.0796303  -0.330   0.7411    
## RED_CARyes                     0.0260249  0.0522158   0.498   0.6182    
## CLM_FREQ                      -0.0487660  0.0249137  -1.957   0.0504 .  
## REVOKEDYes                    -0.0616324  0.0516415  -1.193   0.2328    
## MVR_PTS                        0.0150781  0.0073658   2.047   0.0408 *  
## CAR_AGE                       -0.0014203  0.0046169  -0.308   0.7584    
## URBANICITYUrban                0.0479334  0.0794283   0.603   0.5463    
## YOJ_Y1                         0.2554196  0.1715976   1.488   0.1368    
## INCOME_Y1                      0.0247520  0.3330060   0.074   0.9408    
## HOME_VAL_Y1                    0.5573296  0.8253001   0.675   0.4996    
## OLDCLAIM_Y1                   -0.1568455  0.2557923  -0.613   0.5398    
## INCOME_LOG                    -0.0167623  0.0322878  -0.519   0.6037    
## HOME_VAL_LOG                  -0.0430669  0.0684700  -0.629   0.5294    
## BLUEBOOK_LOG                   0.1748635  0.0345552   5.060 4.55e-07 ***
## OLDCLAIM_LOG                   0.0273403  0.0281369   0.972   0.3313    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8067 on 2112 degrees of freedom
## Multiple R-squared:  0.0327, Adjusted R-squared:  0.01438 
## F-statistic: 1.785 on 40 and 2112 DF,  p-value: 0.001893
par(mfrow=c(2,2))
plot(lm_full)

The residual plots show that the relationship is now linear. The remaining problem is that the residuals are over-dispersed and not normally distributed. Since the optimal transformation suggested by Box-Cox does not fix this, a GLM would be more appropriate to fit the data in this case (a sketch follows below). As required by this assignment, however, we keep the linear models. Because the normality of the residuals is violated, we will not judge the significance of the coefficients by their t-values. Instead, we will compare the models’ performance by adjusted R-squared and root mean squared error (RMSE).
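
For reference, a sketch of the GLM alternative mentioned above: a Gamma GLM with a log link fitted to the untransformed claim amounts (train_df still holds TARGET_AMT at this point; this model is not used further in the assignment):

#Gamma GLM with log link on the positive claim amounts
gamma_df <- subset(train_df, TARGET_FLAG == "1", select = -TARGET_FLAG)
gamma_fit <- glm(TARGET_AMT ~ ., family = Gamma(link = "log"), data = gamma_df)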

Backward Elimination by AIC

Starting with our full model, perform backward elimination by comparing the AIC of the models.

lm_AIC <- step(lm_full, trace = 0)
summary(lm_AIC)
## 
## Call:
## lm(formula = TARGET_AMT_LOG ~ MSTATUS + SEX + CLM_FREQ + MVR_PTS + 
##     BLUEBOOK_LOG, data = lm_train_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.6994 -0.4043  0.0425  0.4106  3.2129 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   6.807358   0.248943  27.345  < 2e-16 ***
## MSTATUSYes   -0.073271   0.034682  -2.113   0.0347 *  
## SEXM          0.054122   0.034990   1.547   0.1221    
## CLM_FREQ     -0.022761   0.014548  -1.565   0.1178    
## MVR_PTS       0.017331   0.007044   2.460   0.0140 *  
## BLUEBOOK_LOG  0.156247   0.026314   5.938 3.36e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.804 on 2147 degrees of freedom
## Multiple R-squared:  0.02314,    Adjusted R-squared:  0.02087 
## F-statistic: 10.17 on 5 and 2147 DF,  p-value: 1.201e-09

Backward Elimination by BIC

Starting with our full model, perform backward elimination by comparing the BIC of the models.

lm_BIC <- step(lm_full, trace = 0, k = log(nrow(lm_train_df)))
summary(lm_BIC)
## 
## Call:
## lm(formula = TARGET_AMT_LOG ~ BLUEBOOK_LOG, data = lm_train_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.7888 -0.3907  0.0425  0.3914  3.2417 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   6.77803    0.24679  27.465  < 2e-16 ***
## BLUEBOOK_LOG  0.15976    0.02626   6.085 1.38e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8058 on 2151 degrees of freedom
## Multiple R-squared:  0.01692,    Adjusted R-squared:  0.01646 
## F-statistic: 37.02 on 1 and 2151 DF,  p-value: 1.377e-09

Model with only characteristics of the cars

The model produced by backward elimination on BIC shows that the book value of the car is the most important factor in determining the claim amount. It is reasonable that the claim amount is closely related to the car’s value, but what about the other characteristics of the cars?
Let’s build a model with only the characteristics of the cars, meaning the drivers’ behaviors are not considered.

lm_car <- lm(TARGET_AMT_LOG~CAR_USE+BLUEBOOK_LOG+CAR_TYPE+RED_CAR+CAR_AGE, data = lm_train_df)
summary(lm_car)
## 
## Call:
## lm(formula = TARGET_AMT_LOG ~ CAR_USE + BLUEBOOK_LOG + CAR_TYPE + 
##     RED_CAR + CAR_AGE, data = lm_train_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.7800 -0.4013  0.0394  0.3963  3.2326 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          6.7691908  0.3035109  22.303  < 2e-16 ***
## CAR_USEPrivate       0.0068413  0.0414216   0.165    0.869    
## BLUEBOOK_LOG         0.1558630  0.0315517   4.940 8.42e-07 ***
## CAR_TYPEPanel Truck  0.0711456  0.0847287   0.840    0.401    
## CAR_TYPEPickup       0.0362248  0.0608058   0.596    0.551    
## CAR_TYPESports Car   0.0214223  0.0667991   0.321    0.748    
## CAR_TYPESUV          0.0349160  0.0570842   0.612    0.541    
## CAR_TYPEVan          0.0017551  0.0759781   0.023    0.982    
## RED_CARyes           0.0549828  0.0463057   1.187    0.235    
## CAR_AGE             -0.0001856  0.0032423  -0.057    0.954    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8068 on 2143 degrees of freedom
## Multiple R-squared:  0.0183, Adjusted R-squared:  0.01417 
## F-statistic: 4.438 on 9 and 2143 DF,  p-value: 8.838e-06

Model Selection

lm_models <- data.frame(model=c(""),Num_of_Coefficients=c(0),
                        R_squared_adj=c(0.0000), RMSE=c(0.0000))
models <- list(lm_full, lm_AIC, lm_BIC, lm_car)
model_names <- c("lm_full", "lm_AIC", "lm_BIC", "lm_car")
for (i in c(1:length(models))) {
    lm_models[i,"model"] <- model_names[i]
    lm_models[i,"Num_of_Coefficients"] <- length(models[[i]]$coefficients) - 1
    lm_models[i,"R_squared_adj"] <- summary(models[[i]])$adj.r.squared
    lm_models[i,"RMSE"] <- sqrt(mean(models[[i]]$residuals^2))
}
lm_models

Model lm_AIC has the highest adjusted R-squared, and its RMSE is very close to the full model’s, so our optimal linear model is lm_AIC. The model using only the characteristics of the cars performs worst: the drivers’ behaviors do affect the claim amount.

par(mfrow=c(2,2))
plot(lm_AIC)

Double-checking the diagnostic plots of lm_AIC, we find no serious problems other than the non-normal residuals.
The coefficient estimates are unbiased since the relationship is linear and the residuals are independent.
To check the significance of the coefficients, bootstrap simulation may be used (a sketch follows below), though we won’t go further in this analysis.
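
A minimal sketch of what that bootstrap could look like, resampling rows and refitting the lm_AIC formula (1,000 replicates is an arbitrary choice):

set.seed(2022)
boot_coefs <- replicate(1000, {
  idx <- sample(nrow(lm_train_df), replace = TRUE)
  coef(lm(formula(lm_AIC), data = lm_train_df[idx, ]))
})
#95% percentile intervals for each coefficient
apply(boot_coefs, 1, quantile, probs = c(0.025, 0.975))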

Evaluation Data Prediction

Data Clean Up and Transformation

test_df$INDEX <- NULL
test_df$INCOME <- as.numeric(gsub('[$,]', '', test_df$INCOME))
test_df$HOME_VAL <- as.numeric(gsub('[$,]', '', test_df$HOME_VAL))
test_df$BLUEBOOK <- as.numeric(gsub('[$,]', '', test_df$BLUEBOOK))
test_df$OLDCLAIM <- as.numeric(gsub('[$,]', '', test_df$OLDCLAIM))
test_df$PARENT1 <- gsub("z_", "", test_df$PARENT1)
test_df$MSTATUS <- gsub("z_", "", test_df$MSTATUS)
test_df$SEX <- gsub("z_", "", test_df$SEX)
test_df$EDUCATION <- gsub("z_", "", test_df$EDUCATION)
test_df$EDUCATION <- gsub("<", "Less Than", test_df$EDUCATION)
test_df$JOB <- gsub("z_", "", test_df$JOB)
test_df$CAR_TYPE <- gsub("z_", "", test_df$CAR_TYPE)
test_df$URBANICITY <- ifelse(test_df$URBANICITY == "Highly Urban/ Urban", "Urban","Rural")
test_df$JOB[test_df$JOB == ""] <- NA
test_df[c("TARGET_FLAG","PARENT1","MSTATUS", "SEX", "EDUCATION", "JOB","CAR_TYPE",
           "RED_CAR", "URBANICITY", "CAR_USE","REVOKED")] <-             
                lapply(test_df[c("TARGET_FLAG","PARENT1","MSTATUS", "SEX", 
                                   "EDUCATION", "JOB","CAR_TYPE", "RED_CAR",
                                   "URBANICITY", "CAR_USE","REVOKED")], factor)
test_df$TARGET_FLAG <- NULL
test_df$TARGET_AMT <- NULL
#mice.reuse() is a helper sourced separately (it is not exported by the mice
#package); it applies the saved imputation models in mickey to the new data
test_df <- mice.reuse(mickey, test_df, maxit = 5, printFlag = FALSE, seed = 2022)[[1]]
summary(test_df)
##     KIDSDRIV           AGE           HOMEKIDS           YOJ       
##  Min.   :0.0000   Min.   :17.00   Min.   :0.0000   Min.   : 0.00  
##  1st Qu.:0.0000   1st Qu.:39.00   1st Qu.:0.0000   1st Qu.: 9.00  
##  Median :0.0000   Median :45.00   Median :0.0000   Median :11.00  
##  Mean   :0.1625   Mean   :45.02   Mean   :0.7174   Mean   :10.35  
##  3rd Qu.:0.0000   3rd Qu.:51.00   3rd Qu.:1.0000   3rd Qu.:13.00  
##  Max.   :3.0000   Max.   :73.00   Max.   :5.0000   Max.   :19.00  
##                                                                   
##      INCOME       PARENT1       HOME_VAL      MSTATUS    SEX     
##  Min.   :     0   No :1875   Min.   :     0   No : 847   F:1170  
##  1st Qu.: 25846   Yes: 266   1st Qu.:     0   Yes:1294   M: 971  
##  Median : 52064              Median :159144                      
##  Mean   : 60475              Mean   :153161                      
##  3rd Qu.: 86393              3rd Qu.:236150                      
##  Max.   :291182              Max.   :669271                      
##                                                                  
##                 EDUCATION             JOB         TRAVTIME     
##  Bachelors           :581   Blue Collar :473   Min.   :  5.00  
##  High School         :622   Clerical    :325   1st Qu.: 22.00  
##  Less ThanHigh School:312   Manager     :324   Median : 33.00  
##  Masters             :420   Professional:303   Mean   : 33.15  
##  PhD                 :206   Lawyer      :229   3rd Qu.: 43.00  
##                             Home Maker  :204   Max.   :105.00  
##                             (Other)     :283                   
##        CAR_USE        BLUEBOOK          TIF                CAR_TYPE  
##  Commercial: 760   Min.   : 1500   Min.   : 1.000   Minivan    :549  
##  Private   :1381   1st Qu.: 8870   1st Qu.: 1.000   Panel Truck:177  
##                    Median :14170   Median : 4.000   Pickup     :383  
##                    Mean   :15469   Mean   : 5.245   Sports Car :272  
##                    3rd Qu.:21050   3rd Qu.: 7.000   SUV        :589  
##                    Max.   :49940   Max.   :25.000   Van        :171  
##                                                                      
##  RED_CAR       OLDCLAIM        CLM_FREQ     REVOKED       MVR_PTS      
##  no :1543   Min.   :    0   Min.   :0.000   No :1880   Min.   : 0.000  
##  yes: 598   1st Qu.:    0   1st Qu.:0.000   Yes: 261   1st Qu.: 0.000  
##             Median :    0   Median :0.000              Median : 1.000  
##             Mean   : 4022   Mean   :0.809              Mean   : 1.766  
##             3rd Qu.: 4718   3rd Qu.:2.000              3rd Qu.: 3.000  
##             Max.   :54399   Max.   :5.000              Max.   :12.000  
##                                                                        
##     CAR_AGE       URBANICITY  
##  Min.   : 0.000   Rural: 403  
##  1st Qu.: 1.000   Urban:1738  
##  Median : 8.000               
##  Mean   : 8.192               
##  3rd Qu.:12.000               
##  Max.   :26.000               
## 
test_df$YOJ_Y <- as.factor(ifelse(test_df$YOJ == 0,0,1))
test_df$INCOME_Y <- as.factor(ifelse(test_df$INCOME == 0,0,1))
test_df$HOME_VAL_Y <- as.factor(ifelse(test_df$HOME_VAL == 0,0,1))
test_df$OLDCLAIM_Y <- as.factor(ifelse(test_df$OLDCLAIM == 0,0,1))
test_df$INCOME_LOG <- log(test_df$INCOME+1)
test_df$HOME_VAL_LOG <- log(test_df$HOME_VAL+1)
test_df$BLUEBOOK_LOG <- log(test_df$BLUEBOOK)
test_df$OLDCLAIM_LOG <- log(test_df$OLDCLAIM+1)
test_df$INCOME <- NULL
test_df$HOME_VAL <- NULL
test_df$BLUEBOOK <- NULL
test_df$OLDCLAIM <- NULL
logistic_test_df <- test_df
logistic_test_df$AGE_Squared <- logistic_test_df$AGE^2
logistic_test_df$KIDSDRIV_0.5 <- (logistic_test_df$KIDSDRIV)^0.5
logistic_test_df$HOMEKIDS_0.5 <- (logistic_test_df$HOMEKIDS)^0.5
logistic_test_df$MVR_PTS_3 <- (logistic_test_df$MVR_PTS)^3
logistic_test_df$TRAVTIME_0.33 <- (logistic_test_df$TRAVTIME)^0.33
logistic_test_df$KIDSDRIV <- NULL
logistic_test_df$HOMEKIDS <- NULL
logistic_test_df$MVR_PTS <- NULL
logistic_test_df$TRAVTIME <- NULL

Claim Classification Prediction

logistic_test_df$TARGET_FLAG <- 
  ifelse(predict(logi_AIC,logistic_test_df, type="response")>
           logi_models[2,"Optimal_Threshold"],1,0)
test_predict <- logistic_test_df$TARGET_FLAG
train_predict <- ifelse(logi_AIC$fitted.values>logi_models[2,"Optimal_Threshold"],1,0)
dist_df <- data.frame(rbind(
      cbind(train_predict,"train_predict"),
      cbind(test_predict,"test_predict")
      ))
colnames(dist_df) <- c("value","data")
dist_df <- table(dist_df)
dist_df[,1] <- dist_df[,1]/sum(dist_df[,1])
dist_df[,2] <- dist_df[,2]/sum(dist_df[,2])
dist_df
##      data
## value test_predict train_predict
##     0    0.6268099     0.6381571
##     1    0.3731901     0.3618429

The model produces similar results for the training and test data: roughly 63% of cases are classified as no crash and 37% as a crash in each set. Our logistic model performs comparably when predicting unseen data.

Claim Amount Prediction

lm_test_df <- test_df
lm_test_df$TARGET_FLAG <- NULL
lm_test_df$TARGET_AMT <- predict(lm_AIC,lm_test_df)
lm_test_df$TARGET_AMT <- exp(lm_test_df$TARGET_AMT)
test_df$TARGET_FLAG <- logistic_test_df$TARGET_FLAG 
test_df$TARGET_AMT <- lm_test_df$TARGET_AMT * logistic_test_df$TARGET_FLAG 
train_predict <- exp(lm_AIC$fitted.values)
test_predict <- test_df$TARGET_AMT[test_df$TARGET_AMT > 0]
dist_df <- data.frame(rbind(
      cbind(train_predict,"train_predict"),
      cbind(test_predict,"test_predict")
      ),stringsAsFactors=FALSE)
colnames(dist_df) <- c("value","data")
dist_df$value <- as.numeric(dist_df$value)
ggplot(dist_df, aes(x=value, color=data)) +
  geom_density()

The predicted claim amounts have similar distributions for the training and test data. Our linear model performs stably when predicting unseen data.