train_df <- read.csv("https://raw.githubusercontent.com/jconno/R-data/master/insurance_training_data.csv")
test_df <- read.csv("https://raw.githubusercontent.com/jconno/R-data/master/insurance-evaluation-data.csv")
First, let’s correct the formats and values of the data.
train_df$INDEX <- NULL
train_df$INCOME <- as.numeric(gsub('[$,]', '', train_df$INCOME))
train_df$HOME_VAL <- as.numeric(gsub('[$,]', '', train_df$HOME_VAL))
train_df$BLUEBOOK <- as.numeric(gsub('[$,]', '', train_df$BLUEBOOK))
train_df$OLDCLAIM <- as.numeric(gsub('[$,]', '', train_df$OLDCLAIM))
train_df$PARENT1 <- gsub("z_", "", train_df$PARENT1)
train_df$MSTATUS <- gsub("z_", "", train_df$MSTATUS)
train_df$SEX <- gsub("z_", "", train_df$SEX)
train_df$EDUCATION <- gsub("z_", "", train_df$EDUCATION)
train_df$EDUCATION <- gsub("<", "Less Than", train_df$EDUCATION)
train_df$JOB <- gsub("z_", "", train_df$JOB)
train_df$CAR_TYPE <- gsub("z_", "", train_df$CAR_TYPE)
train_df$URBANICITY <- ifelse(train_df$URBANICITY=="Highly Urban/ Urban","Urban","Rural")
train_df$JOB[train_df$JOB == ""] <- NA
train_df[c("TARGET_FLAG","PARENT1","MSTATUS", "SEX", "EDUCATION", "JOB","CAR_TYPE",
"RED_CAR", "URBANICITY", "CAR_USE","REVOKED")] <-
lapply(train_df[c("TARGET_FLAG","PARENT1","MSTATUS", "SEX",
"EDUCATION", "JOB","CAR_TYPE", "RED_CAR",
"URBANICITY", "CAR_USE","REVOKED")], factor)
Below is the summary of the cleaned up data.
summary(train_df)
## TARGET_FLAG TARGET_AMT KIDSDRIV AGE HOMEKIDS
## 0:6008 Min. : 0 Min. :0.0000 Min. :16.00 Min. :0.0000
## 1:2153 1st Qu.: 0 1st Qu.:0.0000 1st Qu.:39.00 1st Qu.:0.0000
## Median : 0 Median :0.0000 Median :45.00 Median :0.0000
## Mean : 1504 Mean :0.1711 Mean :44.79 Mean :0.7212
## 3rd Qu.: 1036 3rd Qu.:0.0000 3rd Qu.:51.00 3rd Qu.:1.0000
## Max. :107586 Max. :4.0000 Max. :81.00 Max. :5.0000
## NA's :6
## YOJ INCOME PARENT1 HOME_VAL MSTATUS
## Min. : 0.0 Min. : 0 No :7084 Min. : 0 No :3267
## 1st Qu.: 9.0 1st Qu.: 28097 Yes:1077 1st Qu.: 0 Yes:4894
## Median :11.0 Median : 54028 Median :161160
## Mean :10.5 Mean : 61898 Mean :154867
## 3rd Qu.:13.0 3rd Qu.: 85986 3rd Qu.:238724
## Max. :23.0 Max. :367030 Max. :885282
## NA's :454 NA's :445 NA's :464
## SEX EDUCATION JOB TRAVTIME
## F:4375 Bachelors :2242 Blue Collar :1825 Min. : 5.00
## M:3786 High School :2330 Clerical :1271 1st Qu.: 22.00
## Less ThanHigh School:1203 Professional:1117 Median : 33.00
## Masters :1658 Manager : 988 Mean : 33.49
## PhD : 728 Lawyer : 835 3rd Qu.: 44.00
## (Other) :1599 Max. :142.00
## NA's : 526
## CAR_USE BLUEBOOK TIF CAR_TYPE
## Commercial:3029 Min. : 1500 Min. : 1.000 Minivan :2145
## Private :5132 1st Qu.: 9280 1st Qu.: 1.000 Panel Truck: 676
## Median :14440 Median : 4.000 Pickup :1389
## Mean :15710 Mean : 5.351 Sports Car : 907
## 3rd Qu.:20850 3rd Qu.: 7.000 SUV :2294
## Max. :69740 Max. :25.000 Van : 750
##
## RED_CAR OLDCLAIM CLM_FREQ REVOKED MVR_PTS
## no :5783 Min. : 0 Min. :0.0000 No :7161 Min. : 0.000
## yes:2378 1st Qu.: 0 1st Qu.:0.0000 Yes:1000 1st Qu.: 0.000
## Median : 0 Median :0.0000 Median : 1.000
## Mean : 4037 Mean :0.7986 Mean : 1.696
## 3rd Qu.: 4636 3rd Qu.:2.0000 3rd Qu.: 3.000
## Max. :57037 Max. :5.0000 Max. :13.000
##
## CAR_AGE URBANICITY
## Min. :-3.000 Rural:1669
## 1st Qu.: 1.000 Urban:6492
## Median : 8.000
## Mean : 8.328
## 3rd Qu.:12.000
## Max. :28.000
## NA's :510
YOJ, INCOME, HOME_VAL, and CAR_AGE have many missing values; we will use multiple imputation to fill them in.
CAR_AGE also has an invalid value of -3, which we will replace by imputation as well.
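As a quick check (a minimal sketch added here for illustration), we can count the missing values per column before imputing; the counts should match the NA's rows in the summary above.
#Count missing values for the columns flagged above (plus AGE and JOB, which also contain NAs)
colSums(is.na(train_df[c("AGE", "YOJ", "INCOME", "HOME_VAL", "CAR_AGE", "JOB")]))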
data.m <- melt(train_df[c("TARGET_FLAG","KIDSDRIV","AGE","HOMEKIDS","YOJ","INCOME",
"HOME_VAL", "TRAVTIME","BLUEBOOK","TIF","OLDCLAIM","CLM_FREQ",
"MVR_PTS","CAR_AGE")], id.vars = 'TARGET_FLAG')
ggplot(data.m, aes(x = variable, y = value, fill = TARGET_FLAG)) + geom_boxplot() +
facet_wrap(~ variable, scales = 'free') + theme_classic()
The box plots show that many of the numeric variables are right-skewed; we will transform these variables to reduce the influence of outliers.
plot_KIDSDRIV <- ggplot(train_df, aes(x=KIDSDRIV, color=TARGET_FLAG)) + geom_density(na.rm =TRUE, bw=0.3)
plot_AGE <- ggplot(train_df, aes(x=AGE, color=TARGET_FLAG)) + geom_density(na.rm =TRUE)
plot_HOMEKIDS <- ggplot(train_df, aes(x=HOMEKIDS, color=TARGET_FLAG)) + geom_density(na.rm =TRUE, bw=0.4)
plot_YOJ <- ggplot(train_df, aes(x=YOJ, color=TARGET_FLAG)) + geom_density(na.rm =TRUE)
plot_INCOME <- ggplot(train_df, aes(x=INCOME, color=TARGET_FLAG)) + geom_density(na.rm =TRUE)
plot_HOME_VAL <- ggplot(train_df, aes(x=HOME_VAL, color=TARGET_FLAG)) + geom_density(na.rm =TRUE)
plot_TRAVTIME <- ggplot(train_df, aes(x=TRAVTIME, color=TARGET_FLAG)) + geom_density(na.rm =TRUE)
plot_BLUEBOOK <- ggplot(train_df, aes(x=BLUEBOOK, color=TARGET_FLAG)) + geom_density(na.rm =TRUE)
plot_TIF <- ggplot(train_df, aes(x=TIF, color=TARGET_FLAG)) + geom_density(na.rm =TRUE)
plot_OLDCLAIM <- ggplot(train_df, aes(x=OLDCLAIM, color=TARGET_FLAG)) + geom_density(na.rm =TRUE)
plot_CLM_FREQ <- ggplot(train_df, aes(x=CLM_FREQ, color=TARGET_FLAG)) + geom_density(na.rm =TRUE, bw=0.4)
plot_MVR_PTS <- ggplot(train_df, aes(x=MVR_PTS, color=TARGET_FLAG)) + geom_density(na.rm =TRUE, bw=0.4)
plots_CAR_AGE <- ggplot(train_df, aes(x=CAR_AGE, color=TARGET_FLAG)) + geom_density(na.rm =TRUE)
plot_KIDSDRIV+plot_AGE+plot_HOMEKIDS+plot_YOJ+plot_INCOME+plot_HOME_VAL+
plot_TRAVTIME+plot_BLUEBOOK+plot_TIF+plot_OLDCLAIM+plot_CLM_FREQ+
plot_MVR_PTS+plots_CAR_AGE+
plot_layout(ncol = 4, guides = "collect")
Most of the distributions are similar for target = 0 and target = 1. OLDCLAIM and CLM_FREQ look like good candidates for predicting whether there is a crash.
We can also look at the categorical variables:
plot_PARENT1 <- ggplot(train_df,aes(x=PARENT1,fill=TARGET_FLAG))+geom_bar(position = position_dodge())
plot_MSTATUS <- ggplot(train_df,aes(x=MSTATUS,fill=TARGET_FLAG))+geom_bar(position = position_dodge())
plot_SEX <- ggplot(train_df,aes(x=SEX,fill=TARGET_FLAG))+geom_bar(position = position_dodge())
plot_EDUCATION <- ggplot(train_df,aes(x=substring(train_df$EDUCATION,1,5),fill=TARGET_FLAG))+
geom_bar(position = position_dodge())+xlab("EDUCATION")
plot_JOB <- ggplot(train_df,aes(x=substring(train_df$JOB,1,2),fill=TARGET_FLAG))+
geom_bar(position = position_dodge())+xlab("JOB")
plot_CAR_TYPE <- ggplot(train_df,aes(x=substring(train_df$CAR_TYPE,1,4),fill=TARGET_FLAG))+
geom_bar(position = position_dodge())+xlab("CAR_TYPE")
plot_RED_CAR <- ggplot(train_df,aes(x=RED_CAR,fill=TARGET_FLAG))+geom_bar(position = position_dodge())
plot_URBANICITY <- ggplot(train_df,aes(x=URBANICITY,fill=TARGET_FLAG))+geom_bar(position = position_dodge())
plot_CAR_USE <- ggplot(train_df,aes(x=CAR_USE,fill=TARGET_FLAG))+geom_bar(position = position_dodge())
plot_REVOKED <- ggplot(train_df,aes(x=REVOKED,fill=TARGET_FLAG))+geom_bar(position = position_dodge())
plot_PARENT1+plot_MSTATUS+plot_SEX+plot_EDUCATION+plot_JOB+plot_CAR_TYPE+plot_RED_CAR+
plot_URBANICITY+plot_CAR_USE+plot_REVOKED+plot_layout(ncol = 3, guides = "collect")
PARENT1, MSTATUS, URBANICITY, CAR_USE, and REVOKED show notable differences in distribution between target = 0 and target = 1.
Now let’s look at the correlations between the variables.
corrplot::corrplot(cor(train_df[c("KIDSDRIV","AGE","HOMEKIDS","YOJ","INCOME",
"HOME_VAL","BLUEBOOK","TIF","OLDCLAIM", "CLM_FREQ",
"MVR_PTS","CAR_AGE")], use = "na.or.complete"),
method = 'number', type = 'lower', diag = FALSE, tl.srt = 0.1)
None of the variables are strongly correlated with one another, so there is no serious multicollinearity problem.
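As an extra check (a minimal sketch, using a simple linear fit only as a vehicle for computing variance inflation factors), VIFs from the car package can confirm this; values well below 5 would support the conclusion.
#Hypothetical multicollinearity check: VIFs of the numeric predictors
#(rows with missing values are dropped by lm's default na.action)
vif_fit <- lm(TARGET_AMT ~ KIDSDRIV + AGE + HOMEKIDS + YOJ + INCOME + HOME_VAL +
              BLUEBOOK + TIF + OLDCLAIM + CLM_FREQ + MVR_PTS + CAR_AGE, data = train_df)
car::vif(vif_fit)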
#save the indicators of missing values; they will be used to verify the distributions
#of the imputed values
YOJ_NA <- is.na(train_df$YOJ)
INCOME_NA <- is.na(train_df$INCOME)
HOME_VAL_NA <- is.na(train_df$HOME_VAL)
CAR_AGE_NA <- is.na(train_df$CAR_AGE)
#remove incorrect CAR_AGE value for imputation
train_df$CAR_AGE[train_df$CAR_AGE < 0] <- NA
#temporarily exclude TARGET_FLAG and TARGET_AMT from the imputation
TARGET_FLAG <- train_df$TARGET_FLAG
TARGET_AMT <- train_df$TARGET_AMT
train_df$TARGET_FLAG <- NULL
train_df$TARGET_AMT <- NULL
#save the imputation models to impute the test data set later
mickey <- parlmice(train_df, maxit = 5, m = 1, printFlag = FALSE, seed = 2022, cluster.seed = 2022)
#save the imputation result
train_df <- complete(mickey,1)
#Add TARGET_FLAG and TARGET_AMT back to our dataframe
train_df$TARGET_FLAG <- TARGET_FLAG
train_df$TARGET_AMT <- TARGET_AMT
TARGET_FLAG <- NULL
TARGET_AMT <- NULL
#write.csv(train_df,"train_df.csv", row.names = FALSE)
# train_df <- read.csv("train_df.csv", stringsAsFactors = TRUE)
# train_df$TARGET_FLAG <- as.factor(train_df$TARGET_FLAG)
We can compare the imputed values with the original data values.
The plots in the top row below show the distributions of the original values;
the plots in the bottom row show the distributions of the imputed values.
plot_YOJ <- ggplot(train_df[!YOJ_NA,], aes(x=YOJ, color=TARGET_FLAG)) + geom_density(na.rm =TRUE)
plot_INCOME <- ggplot(train_df[!INCOME_NA,], aes(x=INCOME, color=TARGET_FLAG)) + geom_density(na.rm =TRUE)
plot_HOME_VAL <- ggplot(train_df[!HOME_VAL_NA,], aes(x=HOME_VAL, color=TARGET_FLAG)) + geom_density(na.rm =TRUE)
plot_CAR_AGE <- ggplot(train_df[!CAR_AGE_NA,], aes(x=CAR_AGE, color=TARGET_FLAG)) + geom_density(na.rm =TRUE)
plot_YOJ2 <- ggplot(train_df[YOJ_NA,], aes(x=YOJ, color=TARGET_FLAG)) + geom_density(na.rm =TRUE)
plot_INCOME2 <- ggplot(train_df[INCOME_NA,], aes(x=INCOME, color=TARGET_FLAG)) + geom_density(na.rm =TRUE)
plot_HOME_VAL2 <- ggplot(train_df[HOME_VAL_NA,], aes(x=HOME_VAL, color=TARGET_FLAG)) + geom_density(na.rm =TRUE)
plot_CAR_AGE2 <- ggplot(train_df[CAR_AGE_NA,], aes(x=CAR_AGE, color=TARGET_FLAG)) + geom_density(na.rm =TRUE)
plot_YOJ+plot_INCOME+plot_HOME_VAL+plot_CAR_AGE+
plot_YOJ2+plot_INCOME2+plot_HOME_VAL2+plot_CAR_AGE2+
plot_layout(ncol = 4, guides = "collect")
The distributions look similar and so the imputed values are plausible.
YOJ: The density plot shows the variable is zero-inflated. The coefficient for YOJ=0 and the coefficient for YOJ>0 may be significantly different. Therefore, we would add a dummy variable indicating whether the variable is 0.
HOME_VAL: The variable is also zero-inflated, we would add a dummy variable indicating whether the person has a house.
INCOME: We would add a dummy variable indicating whether the person has a job. Practically, it is a key factor in insurance pricing.
OLDCLAIM: We would add a dummy variable indicating whether the person had an old claim. The coefficient for OLDCLAIM=0 and the coefficient for OLDCLAIM>0 may be significantly different.
INCOME, HOME_VAL, BLUEBOOK, OLDCLAIM: We will log transform all monetary variables as they are right-skewed.
train_df$YOJ_Y <- as.factor(ifelse(train_df$YOJ == 0,0,1))
train_df$INCOME_Y <- as.factor(ifelse(train_df$INCOME == 0,0,1))
train_df$HOME_VAL_Y <- as.factor(ifelse(train_df$HOME_VAL == 0,0,1))
train_df$OLDCLAIM_Y <- as.factor(ifelse(train_df$OLDCLAIM == 0,0,1))
train_df$INCOME_LOG <- log(train_df$INCOME+1)
train_df$HOME_VAL_LOG <- log(train_df$HOME_VAL+1)
train_df$BLUEBOOK_LOG <- log(train_df$BLUEBOOK)
train_df$OLDCLAIM_LOG <- log(train_df$OLDCLAIM+1)
train_df$INCOME <- NULL
train_df$HOME_VAL <- NULL
train_df$BLUEBOOK <- NULL
train_df$OLDCLAIM <- NULL
logistic_train_df <- train_df
First, let’s build a test model to see if any additional transformations are needed to fit our logistic models.
test_model <- glm(TARGET_FLAG~.-TARGET_AMT, family = binomial, logistic_train_df)
marginalModelPlots(test_model,~KIDSDRIV + AGE + HOMEKIDS + YOJ + BLUEBOOK_LOG +
HOME_VAL_LOG + TRAVTIME+ INCOME_LOG + TIF + OLDCLAIM_LOG +
CLM_FREQ + MVR_PTS + CAR_AGE, layout =c(4,4))
Additional transformations are needed for KIDSDRIV, AGE, HOMEKIDS, TRAVTIME, and MVR_PTS.
From the density plots above, we see that AGE is approximately normal for both target = 0 and target = 1. According to the textbook A Modern Approach to Regression with R, if the variance of a predictor differs between the two response values, a squared term should be added.
data.frame(Variance_of_AGE_TARGET0=c(var(logistic_train_df$AGE[logistic_train_df$TARGET_FLAG==0])),
Variance_of_AGE_TARGET1=c(var(logistic_train_df$AGE[logistic_train_df$TARGET_FLAG==1])))
var.test(AGE ~ TARGET_FLAG, logistic_train_df, alternative = "two.sided")
##
## F test to compare two variances
##
## data: AGE by TARGET_FLAG
## F = 0.73556, num df = 6007, denom df = 2152, p-value < 2.2e-16
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
## 0.6856687 0.7881551
## sample estimates:
## ratio of variances
## 0.7355552
The variances are clearly different. We will add a squared term and check whether it improves the fit.
logistic_train_df$AGE_Squared <- logistic_train_df$AGE^2
For KIDSDRIV, HOMEKIDS, TRAVTIME, and MVR_PTS, power transformations are used; the powers were determined by trial and error.
logistic_train_df$KIDSDRIV_0.5 <- (logistic_train_df$KIDSDRIV)^0.5
logistic_train_df$HOMEKIDS_0.5 <- (logistic_train_df$HOMEKIDS)^0.5
logistic_train_df$MVR_PTS_3 <- (logistic_train_df$MVR_PTS)^3
logistic_train_df$TRAVTIME_0.33 <- (logistic_train_df$TRAVTIME)^0.33
logistic_train_df$KIDSDRIV <- NULL
logistic_train_df$HOMEKIDS <- NULL
logistic_train_df$MVR_PTS <- NULL
logistic_train_df$TRAVTIME <- NULL
After all the transformations, the test model now fits our data well.
test_model <- glm(TARGET_FLAG~.-TARGET_AMT, family = binomial, logistic_train_df)
marginalModelPlots(test_model,~KIDSDRIV_0.5 + AGE + HOMEKIDS_0.5 + YOJ + BLUEBOOK_LOG +
HOME_VAL_LOG + TRAVTIME_0.33 + INCOME_LOG + TIF + OLDCLAIM_LOG +
CLM_FREQ + MVR_PTS_3 + CAR_AGE, layout =c(4,4))
First, we build a full model with all predictors.
logi_full <- glm(TARGET_FLAG~.-TARGET_AMT, family = binomial, logistic_train_df)
summary(logi_full)
##
## Call:
## glm(formula = TARGET_FLAG ~ . - TARGET_AMT, family = binomial,
## data = logistic_train_df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.6561 -0.7040 -0.3916 0.5906 3.0517
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 4.0329974 0.8904557 4.529 5.92e-06 ***
## AGE -0.2054268 0.0261087 -7.868 3.60e-15 ***
## YOJ 0.0094186 0.0121940 0.772 0.439882
## PARENT1Yes 0.2551747 0.1177500 2.167 0.030228 *
## MSTATUSYes -0.5506845 0.0907463 -6.068 1.29e-09 ***
## SEXM 0.0142482 0.1087067 0.131 0.895720
## EDUCATIONHigh School 0.4277573 0.0915449 4.673 2.97e-06 ***
## EDUCATIONLess ThanHigh School 0.3945122 0.1189492 3.317 0.000911 ***
## EDUCATIONMasters 0.0741044 0.1271528 0.583 0.560029
## EDUCATIONPhD 0.2959814 0.1703770 1.737 0.082349 .
## JOBClerical 0.0764184 0.1074750 0.711 0.477063
## JOBDoctor -1.0689510 0.2486535 -4.299 1.72e-05 ***
## JOBHome Maker -0.3877938 0.1773726 -2.186 0.028792 *
## JOBLawyer -0.1810184 0.1681735 -1.076 0.281758
## JOBManager -0.8139390 0.1312582 -6.201 5.61e-10 ***
## JOBProfessional -0.1443933 0.1171210 -1.233 0.217629
## JOBStudent -0.4648306 0.1679583 -2.768 0.005648 **
## CAR_USEPrivate -0.7841910 0.0895536 -8.757 < 2e-16 ***
## TIF -0.0558025 0.0073917 -7.549 4.37e-14 ***
## CAR_TYPEPanel Truck 0.5798982 0.1507360 3.847 0.000120 ***
## CAR_TYPEPickup 0.5938969 0.1011924 5.869 4.38e-09 ***
## CAR_TYPESports Car 0.8766221 0.1293159 6.779 1.21e-11 ***
## CAR_TYPESUV 0.7113846 0.1078339 6.597 4.19e-11 ***
## CAR_TYPEVan 0.6913626 0.1261029 5.483 4.19e-08 ***
## RED_CARyes -0.0529047 0.0874085 -0.605 0.545008
## CLM_FREQ 0.0512862 0.0448578 1.143 0.252912
## REVOKEDYes 0.8619659 0.0892525 9.658 < 2e-16 ***
## CAR_AGE -0.0049525 0.0076018 -0.651 0.514735
## URBANICITYUrban 2.3670806 0.1134796 20.859 < 2e-16 ***
## YOJ_Y1 -0.1591415 0.3094673 -0.514 0.607082
## INCOME_Y1 0.3383702 0.5821690 0.581 0.561090
## HOME_VAL_Y1 2.7303698 1.3884451 1.966 0.049242 *
## OLDCLAIM_Y1 1.7604446 0.4177531 4.214 2.51e-05 ***
## INCOME_LOG -0.0947646 0.0552415 -1.715 0.086261 .
## HOME_VAL_LOG -0.2489162 0.1144621 -2.175 0.029656 *
## BLUEBOOK_LOG -0.3311012 0.0596444 -5.551 2.84e-08 ***
## OLDCLAIM_LOG -0.1597333 0.0459161 -3.479 0.000504 ***
## AGE_Squared 0.0022731 0.0002855 7.962 1.69e-15 ***
## KIDSDRIV_0.5 0.6879048 0.0860413 7.995 1.30e-15 ***
## HOMEKIDS_0.5 0.0106102 0.0694718 0.153 0.878613
## MVR_PTS_3 0.0016706 0.0002621 6.373 1.85e-10 ***
## TRAVTIME_0.33 0.4424007 0.0556728 7.946 1.92e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 9418.0 on 8160 degrees of freedom
## Residual deviance: 7195.2 on 8119 degrees of freedom
## AIC: 7279.2
##
## Number of Fisher Scoring iterations: 5
Starting with our full model, we perform backward elimination by comparing the AIC of the models.
logi_AIC <- step(logi_full,trace=0)
summary(logi_AIC)
##
## Call:
## glm(formula = TARGET_FLAG ~ AGE + PARENT1 + MSTATUS + EDUCATION +
## JOB + CAR_USE + TIF + CAR_TYPE + REVOKED + URBANICITY + HOME_VAL_Y +
## OLDCLAIM_Y + INCOME_LOG + HOME_VAL_LOG + BLUEBOOK_LOG + OLDCLAIM_LOG +
## AGE_Squared + KIDSDRIV_0.5 + MVR_PTS_3 + TRAVTIME_0.33, family = binomial,
## data = logistic_train_df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.6611 -0.7024 -0.3907 0.5939 3.0425
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 3.9489782 0.8090148 4.881 1.05e-06 ***
## AGE -0.2075123 0.0251120 -8.263 < 2e-16 ***
## PARENT1Yes 0.2749929 0.1016652 2.705 0.006833 **
## MSTATUSYes -0.5318116 0.0858020 -6.198 5.71e-10 ***
## EDUCATIONHigh School 0.4573110 0.0842096 5.431 5.62e-08 ***
## EDUCATIONLess ThanHigh School 0.4355342 0.1092115 3.988 6.66e-05 ***
## EDUCATIONMasters 0.0499713 0.1207856 0.414 0.679080
## EDUCATIONPhD 0.2709381 0.1658346 1.634 0.102304
## JOBClerical 0.0864041 0.1061130 0.814 0.415493
## JOBDoctor -1.0766600 0.2484480 -4.334 1.47e-05 ***
## JOBHome Maker -0.3531558 0.1677204 -2.106 0.035237 *
## JOBLawyer -0.1863971 0.1681038 -1.109 0.267507
## JOBManager -0.8162009 0.1311596 -6.223 4.88e-10 ***
## JOBProfessional -0.1469552 0.1170122 -1.256 0.209153
## JOBStudent -0.4183337 0.1484795 -2.817 0.004841 **
## CAR_USEPrivate -0.7845867 0.0895026 -8.766 < 2e-16 ***
## TIF -0.0556795 0.0073869 -7.538 4.79e-14 ***
## CAR_TYPEPanel Truck 0.5696467 0.1443959 3.945 7.98e-05 ***
## CAR_TYPEPickup 0.5916173 0.1011371 5.850 4.93e-09 ***
## CAR_TYPESports Car 0.8893983 0.1091419 8.149 3.67e-16 ***
## CAR_TYPESUV 0.7232583 0.0864527 8.366 < 2e-16 ***
## CAR_TYPEVan 0.6839796 0.1226482 5.577 2.45e-08 ***
## REVOKEDYes 0.8608141 0.0891811 9.652 < 2e-16 ***
## URBANICITYUrban 2.3653811 0.1133968 20.859 < 2e-16 ***
## HOME_VAL_Y1 2.9662970 1.2829635 2.312 0.020774 *
## OLDCLAIM_Y1 1.8632701 0.4063386 4.586 4.53e-06 ***
## INCOME_LOG -0.0646728 0.0144620 -4.472 7.75e-06 ***
## HOME_VAL_LOG -0.2682673 0.1056822 -2.538 0.011135 *
## BLUEBOOK_LOG -0.3303493 0.0556539 -5.936 2.92e-09 ***
## OLDCLAIM_LOG -0.1592683 0.0458819 -3.471 0.000518 ***
## AGE_Squared 0.0022997 0.0002771 8.299 < 2e-16 ***
## KIDSDRIV_0.5 0.6997619 0.0748697 9.346 < 2e-16 ***
## MVR_PTS_3 0.0016625 0.0002616 6.355 2.09e-10 ***
## TRAVTIME_0.33 0.4435298 0.0556466 7.970 1.58e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 9418.0 on 8160 degrees of freedom
## Residual deviance: 7198.4 on 8127 degrees of freedom
## AIC: 7266.4
##
## Number of Fisher Scoring iterations: 5
Starting with our full model, we perform backward elimination by comparing the BIC of the models.
logi_BIC <- step(logi_full,trace=0,k=log(nrow(logistic_train_df)))
summary(logi_BIC)
##
## Call:
## glm(formula = TARGET_FLAG ~ AGE + MSTATUS + JOB + CAR_USE + TIF +
## CAR_TYPE + REVOKED + CAR_AGE + URBANICITY + OLDCLAIM_Y +
## INCOME_LOG + HOME_VAL_LOG + BLUEBOOK_LOG + OLDCLAIM_LOG +
## AGE_Squared + KIDSDRIV_0.5 + MVR_PTS_3 + TRAVTIME_0.33, family = binomial,
## data = logistic_train_df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.7053 -0.7048 -0.3991 0.5995 3.0181
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 5.1562404 0.7823239 6.591 4.37e-11 ***
## AGE -0.2229706 0.0245599 -9.079 < 2e-16 ***
## MSTATUSYes -0.6094354 0.0766188 -7.954 1.80e-15 ***
## JOBClerical 0.1589964 0.1044652 1.522 0.128008
## JOBDoctor -1.0214492 0.1942875 -5.257 1.46e-07 ***
## JOBHome Maker -0.3433937 0.1591796 -2.157 0.030984 *
## JOBLawyer -0.3053249 0.1308346 -2.334 0.019613 *
## JOBManager -0.9350503 0.1166132 -8.018 1.07e-15 ***
## JOBProfessional -0.3264372 0.1083441 -3.013 0.002587 **
## JOBStudent -0.4415347 0.1482545 -2.978 0.002899 **
## CAR_USEPrivate -0.7524262 0.0849223 -8.860 < 2e-16 ***
## TIF -0.0552870 0.0073589 -7.513 5.78e-14 ***
## CAR_TYPEPanel Truck 0.5874722 0.1408720 4.170 3.04e-05 ***
## CAR_TYPEPickup 0.6142354 0.0999528 6.145 7.98e-10 ***
## CAR_TYPESports Car 0.8880994 0.1087609 8.166 3.20e-16 ***
## CAR_TYPESUV 0.7257516 0.0861241 8.427 < 2e-16 ***
## CAR_TYPEVan 0.6678454 0.1214282 5.500 3.80e-08 ***
## REVOKEDYes 0.8572408 0.0888624 9.647 < 2e-16 ***
## CAR_AGE -0.0197342 0.0063285 -3.118 0.001819 **
## URBANICITYUrban 2.3460506 0.1129775 20.766 < 2e-16 ***
## OLDCLAIM_Y1 1.8478524 0.4046282 4.567 4.95e-06 ***
## INCOME_LOG -0.0759929 0.0141159 -5.383 7.31e-08 ***
## HOME_VAL_LOG -0.0255439 0.0069727 -3.663 0.000249 ***
## BLUEBOOK_LOG -0.3428736 0.0553806 -6.191 5.97e-10 ***
## OLDCLAIM_LOG -0.1566729 0.0457065 -3.428 0.000608 ***
## AGE_Squared 0.0024327 0.0002732 8.906 < 2e-16 ***
## KIDSDRIV_0.5 0.7532340 0.0712011 10.579 < 2e-16 ***
## MVR_PTS_3 0.0016393 0.0002613 6.273 3.54e-10 ***
## TRAVTIME_0.33 0.4280523 0.0553681 7.731 1.07e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 9418.0 on 8160 degrees of freedom
## Residual deviance: 7238.2 on 8132 degrees of freedom
## AIC: 7296.2
##
## Number of Fisher Scoring iterations: 5
Starting with our full model, we perform backward elimination using the Chi-square test.
#Define a function to perform backward elimination with a Chi-square test,
#using the significance level / alpha as one of the parameters
backward_chi <- function (train_df, significance) {
  glm_string <- "TARGET_FLAG~.-TARGET_AMT"
  glm_formula <- as.formula(glm_string)
  repeat{
    #Drop each predictor in turn and record the Chi-square p-value
    drop1_chi <- drop1(glm(glm_formula, family=binomial, train_df), test="Chi")
    chi_result <- data.frame(predictors = rownames(drop1_chi)[-1],
                             p_value = drop1_chi[-1,5])
    #Sort so the least significant predictor comes first
    chi_result <- chi_result[order(chi_result$p_value,decreasing=TRUE),]
    #Stop once every remaining predictor is significant at the chosen level
    if(chi_result[1,2] < significance){
      break
    }
    else {
      #Otherwise remove the least significant predictor and refit
      glm_string <- paste0(glm_string,"-",chi_result[1,1])
      glm_formula <- as.formula(glm_string)
    }
  }
  return(glm_formula)
}
Model with alpha 0.001 (based on the Chi-square test)
logi_chi_0.001 <- backward_chi(logistic_train_df, 0.001)
logi_chi_0.001 <- glm(logi_chi_0.001, family=binomial, logistic_train_df)
summary(logi_chi_0.001)
##
## Call:
## glm(formula = logi_chi_0.001, family = binomial, data = logistic_train_df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.6191 -0.7019 -0.3967 0.5973 3.0517
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 4.5472383 0.7887741 5.765 8.17e-09 ***
## AGE -0.2230854 0.0246142 -9.063 < 2e-16 ***
## MSTATUSYes -0.6344301 0.0769689 -8.243 < 2e-16 ***
## EDUCATIONHigh School 0.4774188 0.0836198 5.709 1.13e-08 ***
## EDUCATIONLess ThanHigh School 0.4724104 0.1078563 4.380 1.19e-05 ***
## EDUCATIONMasters 0.0278310 0.1203973 0.231 0.817191
## EDUCATIONPhD 0.2189363 0.1641822 1.333 0.182369
## JOBClerical 0.1268681 0.1049032 1.209 0.226516
## JOBDoctor -1.0767104 0.2475024 -4.350 1.36e-05 ***
## JOBHome Maker -0.2760199 0.1638944 -1.684 0.092156 .
## JOBLawyer -0.1755763 0.1678096 -1.046 0.295431
## JOBManager -0.8119527 0.1309258 -6.202 5.59e-10 ***
## JOBProfessional -0.1535110 0.1167957 -1.314 0.188727
## JOBStudent -0.4405995 0.1485985 -2.965 0.003026 **
## CAR_USEPrivate -0.7862542 0.0894799 -8.787 < 2e-16 ***
## TIF -0.0557758 0.0073789 -7.559 4.07e-14 ***
## CAR_TYPEPanel Truck 0.5451750 0.1440513 3.785 0.000154 ***
## CAR_TYPEPickup 0.5924988 0.1010796 5.862 4.58e-09 ***
## CAR_TYPESports Car 0.8956242 0.1090053 8.216 < 2e-16 ***
## CAR_TYPESUV 0.7260057 0.0863553 8.407 < 2e-16 ***
## CAR_TYPEVan 0.6633982 0.1223797 5.421 5.93e-08 ***
## REVOKEDYes 0.8598997 0.0890678 9.654 < 2e-16 ***
## URBANICITYUrban 2.3638832 0.1132547 20.872 < 2e-16 ***
## OLDCLAIM_Y1 1.8730582 0.4061688 4.612 4.00e-06 ***
## INCOME_LOG -0.0720505 0.0142046 -5.072 3.93e-07 ***
## HOME_VAL_LOG -0.0242285 0.0069886 -3.467 0.000527 ***
## BLUEBOOK_LOG -0.3316303 0.0555233 -5.973 2.33e-09 ***
## OLDCLAIM_LOG -0.1600205 0.0458666 -3.489 0.000485 ***
## AGE_Squared 0.0024332 0.0002737 8.889 < 2e-16 ***
## KIDSDRIV_0.5 0.7563153 0.0714373 10.587 < 2e-16 ***
## MVR_PTS_3 0.0016612 0.0002616 6.351 2.14e-10 ***
## TRAVTIME_0.33 0.4373746 0.0555511 7.873 3.45e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 9418.0 on 8160 degrees of freedom
## Residual deviance: 7210.3 on 8129 degrees of freedom
## AIC: 7274.3
##
## Number of Fisher Scoring iterations: 5
Since the data is imbalanced, we will not use the default threshold of 0.5 for our model predictions.
In business, we don’t want to misclassify a high-risk person as low risk, but we also don’t want to lose customers by charging low-risk people a high-risk rate. In practice, a cost matrix should be used to determine the classification threshold. Since we don’t know the costs here, we will weight Sensitivity and Specificity equally and find the optimal threshold that maximizes the sum of Sensitivity and Specificity.
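For illustration only, here is a minimal sketch of how a cost matrix could drive the threshold choice, with hypothetical costs (a false negative assumed to cost ten times a false positive); these costs are assumptions and are not used anywhere else in the analysis.
#Hypothetical cost-based threshold selection (cost values are illustrative assumptions)
cost_fn <- 10  #assumed cost of classifying an actual crash as "no crash"
cost_fp <- 1   #assumed cost of classifying an actual non-crash as "crash"
actual <- as.numeric(as.character(logistic_train_df$TARGET_FLAG))
thresholds <- seq(0.05, 0.95, by = 0.01)
total_cost <- sapply(thresholds, function(t) {
  pred <- ifelse(logi_full$fitted.values > t, 1, 0)
  cost_fn * sum(pred == 0 & actual == 1) + cost_fp * sum(pred == 1 & actual == 0)
})
thresholds[which.min(total_cost)]  #threshold minimizing the total cost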
logi_models <- data.frame(model=c(""),DF=c(0),AIC=c(0.0000),AUC=c(0.0000),
Optimal_Threshold=c(0.0000),Sensitivity=c(0.0000),
Specificity=c(0.0000),Sum_Sens_Spec=c(0.0000))
models <- list(logi_full, logi_AIC, logi_BIC, logi_chi_0.001)
model_names <- c("logi_full", "logi_AIC", "logi_BIC", "logi_chi_0.001")
for (i in c(1:length(models))) {
logi_models[i,"model"] <- model_names[i]
logi_models[i,"DF"] <- models[[i]]$df.residual
logi_models[i,"AIC"] <- round(models[[i]]$aic,4)
rocCurve <- roc(logistic_train_df$TARGET_FLAG, models[[i]]$fitted.values)
logi_models[i,"AUC"] <- round(rocCurve$auc,4)
roc_df <- data.frame(Sensitivity = rocCurve$sensitivities,
Specificity = rocCurve$specificities,
Sum_Sens_Spec = rocCurve$sensitivities+rocCurve$specificities,
Thresholds = rocCurve$thresholds)
roc_df <- roc_df[which.max(roc_df$Sum_Sens_Spec),]
logi_models[i,"Optimal_Threshold"] <- roc_df$Thresholds
logi_models[i,"Sensitivity"] <- roc_df$Sensitivity
logi_models[i,"Specificity"] <- roc_df$Specificity
logi_models[i,"Sum_Sens_Spec"] <- roc_df$Sum_Sens_Spec
}
logi_models
Comparing the AUC and the sum of Sensitivity and Specificity, the best model is logi_AIC. Its performance is very close to that of the full model, but the full model should not be selected since it is less parsimonious.
The following is the ROC curve of the logi_AIC model.
rocCurve <- roc(logistic_train_df$TARGET_FLAG, logi_AIC$fitted.values)
plot(rocCurve)
The following is the confusion matrix of the logi_AIC model.
predicted_class <- ifelse(logi_AIC$fitted.values>logi_models[2,"Optimal_Threshold"],1,0)
confusion_matrix <- confusionMatrix(as.factor(predicted_class),
as.factor(train_df$TARGET_FLAG),positive = "1")
confusion_matrix
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 4617 591
## 1 1391 1562
##
## Accuracy : 0.7571
## 95% CI : (0.7477, 0.7664)
## No Information Rate : 0.7362
## P-Value [Acc > NIR] : 7.855e-06
##
## Kappa : 0.4414
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.7255
## Specificity : 0.7685
## Pos Pred Value : 0.5290
## Neg Pred Value : 0.8865
## Prevalence : 0.2638
## Detection Rate : 0.1914
## Detection Prevalence : 0.3618
## Balanced Accuracy : 0.7470
##
## 'Positive' Class : 1
##
The marginal model plots below show no lack of fit in our model.
marginalModelPlots(logi_AIC,~KIDSDRIV_0.5 + AGE + BLUEBOOK_LOG + INCOME_LOG +
HOME_VAL_LOG + TRAVTIME_0.33 + TIF + OLDCLAIM_LOG + CLM_FREQ +
MVR_PTS_3 + CAR_AGE, layout =c(4,3))
The binned residual plot below also shows that the Pearson residuals are roughly independent with approximately constant variance, with only a few outliers.
#arm::binnedplot(x = fitted(logi_AIC), y = residuals(logi_AIC, type="pearson"),
arm::binnedplot(x = predict(logi_AIC, type="link"), y = residuals(logi_AIC, type="pearson"),
nclass = NULL,
xlab = "Mean of Link Function Value",
ylab = "Average Pearson residual",
main = "Binned residual plot",
cex.pts = 0.8,
col.pts = 1,
col.int = "gray")
We conclude that our optimal logistic model, logi_AIC, is valid.
First, let’s build a test model to check if any additional transformations are needed to build a valid linear model.
lm_train_df <- train_df[train_df$TARGET_FLAG==1,]
lm_train_df$TARGET_FLAG <- NULL
lm_full <- lm(TARGET_AMT~., lm_train_df)
summary(lm_full)
##
## Call:
## lm(formula = TARGET_AMT ~ ., data = lm_train_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8154 -3205 -1492 471 99479
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8294.8912 3513.3996 -2.361 0.0183 *
## KIDSDRIV -234.3833 318.5517 -0.736 0.4619
## AGE 23.1846 21.8577 1.061 0.2889
## HOMEKIDS 239.4402 214.3595 1.117 0.2641
## YOJ 7.3693 72.6747 0.101 0.9192
## PARENT1Yes 310.6962 589.3268 0.527 0.5981
## MSTATUSYes -843.3218 519.5911 -1.623 0.1047
## SEXM 1053.5283 633.5866 1.663 0.0965 .
## EDUCATIONHigh School -735.9043 515.6747 -1.427 0.1537
## EDUCATIONLess ThanHigh School -355.4430 653.0230 -0.544 0.5863
## EDUCATIONMasters 1294.4823 796.1486 1.626 0.1041
## EDUCATIONPhD 2765.8499 1087.1973 2.544 0.0110 *
## JOBClerical -233.1716 586.7995 -0.397 0.6911
## JOBDoctor -3377.7663 1603.4557 -2.107 0.0353 *
## JOBHome Maker -661.8363 1019.7723 -0.649 0.5164
## JOBLawyer -1240.3331 1027.6515 -1.207 0.2276
## JOBManager -1625.7257 835.6109 -1.946 0.0518 .
## JOBProfessional 262.4070 665.7051 0.394 0.6935
## JOBStudent -192.8816 919.8533 -0.210 0.8339
## TRAVTIME -0.4624 11.0895 -0.042 0.9667
## CAR_USEPrivate -269.3929 509.2279 -0.529 0.5968
## TIF -13.6367 42.5485 -0.320 0.7486
## CAR_TYPEPanel Truck 88.8685 874.5999 0.102 0.9191
## CAR_TYPEPickup -78.1660 596.7722 -0.131 0.8958
## CAR_TYPESports Car 886.1392 735.9876 1.204 0.2287
## CAR_TYPESUV 591.4067 644.5484 0.918 0.3590
## CAR_TYPEVan 190.3336 759.2246 0.251 0.8021
## RED_CARyes -159.9960 497.8451 -0.321 0.7480
## CLM_FREQ -74.2320 237.5360 -0.313 0.7547
## REVOKEDYes -889.8444 492.3692 -1.807 0.0709 .
## MVR_PTS 123.6897 70.2285 1.761 0.0783 .
## CAR_AGE -98.0778 44.0188 -2.228 0.0260 *
## URBANICITYUrban 45.8823 757.2989 0.061 0.9517
## YOJ_Y1 556.5369 1636.0747 0.340 0.7338
## INCOME_Y1 1733.0355 3175.0021 0.546 0.5852
## HOME_VAL_Y1 -3132.6728 7868.7164 -0.398 0.6906
## OLDCLAIM_Y1 -962.1329 2438.8183 -0.395 0.6932
## INCOME_LOG -222.2419 307.8439 -0.722 0.4704
## HOME_VAL_LOG 302.4478 652.8178 0.463 0.6432
## BLUEBOOK_LOG 1420.8187 329.4618 4.313 1.69e-05 ***
## OLDCLAIM_LOG 118.4161 268.2680 0.441 0.6590
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7691 on 2112 degrees of freedom
## Multiple R-squared: 0.03168, Adjusted R-squared: 0.01334
## F-statistic: 1.727 on 40 and 2112 DF, p-value: 0.003256
par(mfrow=c(2,2))
plot(lm_full)
The plots show that there is a non-linear relationship between the response variable and the predictors.
Let’s see what transformation Box-Cox would suggest for the response variable.
bc <- boxcox(lm_full)
lambda <- bc$x[which.max(bc$y)]
lambda
## [1] 0.02020202
The result indicates that a log transformation is appropriate: the estimated lambda is approximately 0, and the Box-Cox transform with lambda = 0 corresponds to log(y).
lm_train_df$TARGET_AMT_LOG <- log(lm_train_df$TARGET_AMT)
lm_train_df$TARGET_AMT <- NULL
First, we build a full model with all predictors.
lm_full <- lm(TARGET_AMT_LOG~., lm_train_df)
summary(lm_full)
##
## Call:
## lm(formula = TARGET_AMT_LOG ~ ., data = lm_train_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.6771 -0.4051 0.0395 0.4046 3.3205
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.3579773 0.3684984 17.254 < 2e-16 ***
## KIDSDRIV -0.0408035 0.0334109 -1.221 0.2221
## AGE 0.0024703 0.0022925 1.078 0.2814
## HOMEKIDS 0.0291498 0.0224828 1.297 0.1949
## YOJ -0.0083886 0.0076224 -1.101 0.2712
## PARENT1Yes 0.0311088 0.0618108 0.503 0.6148
## MSTATUSYes -0.0913721 0.0544966 -1.677 0.0938 .
## SEXM 0.0900364 0.0664529 1.355 0.1756
## EDUCATIONHigh School 0.0435382 0.0540859 0.805 0.4209
## EDUCATIONLess ThanHigh School 0.0373117 0.0684915 0.545 0.5860
## EDUCATIONMasters 0.1936436 0.0835030 2.319 0.0205 *
## EDUCATIONPhD 0.2939906 0.1140293 2.578 0.0100 **
## JOBClerical -0.0014639 0.0615457 -0.024 0.9810
## JOBDoctor -0.1980688 0.1681764 -1.178 0.2390
## JOBHome Maker -0.0815360 0.1069575 -0.762 0.4460
## JOBLawyer -0.1529216 0.1077839 -1.419 0.1561
## JOBManager -0.0763891 0.0876420 -0.872 0.3835
## JOBProfessional 0.0274579 0.0698216 0.393 0.6942
## JOBStudent 0.0297514 0.0964776 0.308 0.7578
## TRAVTIME -0.0003521 0.0011631 -0.303 0.7621
## CAR_USEPrivate 0.0066168 0.0534097 0.124 0.9014
## TIF -0.0018637 0.0044626 -0.418 0.6763
## CAR_TYPEPanel Truck 0.0208568 0.0917313 0.227 0.8202
## CAR_TYPEPickup 0.0281819 0.0625917 0.450 0.6526
## CAR_TYPESports Car 0.0704750 0.0771931 0.913 0.3614
## CAR_TYPESUV 0.0878397 0.0676026 1.299 0.1940
## CAR_TYPEVan -0.0263177 0.0796303 -0.330 0.7411
## RED_CARyes 0.0260249 0.0522158 0.498 0.6182
## CLM_FREQ -0.0487660 0.0249137 -1.957 0.0504 .
## REVOKEDYes -0.0616324 0.0516415 -1.193 0.2328
## MVR_PTS 0.0150781 0.0073658 2.047 0.0408 *
## CAR_AGE -0.0014203 0.0046169 -0.308 0.7584
## URBANICITYUrban 0.0479334 0.0794283 0.603 0.5463
## YOJ_Y1 0.2554196 0.1715976 1.488 0.1368
## INCOME_Y1 0.0247520 0.3330060 0.074 0.9408
## HOME_VAL_Y1 0.5573296 0.8253001 0.675 0.4996
## OLDCLAIM_Y1 -0.1568455 0.2557923 -0.613 0.5398
## INCOME_LOG -0.0167623 0.0322878 -0.519 0.6037
## HOME_VAL_LOG -0.0430669 0.0684700 -0.629 0.5294
## BLUEBOOK_LOG 0.1748635 0.0345552 5.060 4.55e-07 ***
## OLDCLAIM_LOG 0.0273403 0.0281369 0.972 0.3313
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8067 on 2112 degrees of freedom
## Multiple R-squared: 0.0327, Adjusted R-squared: 0.01438
## F-statistic: 1.785 on 40 and 2112 DF, p-value: 0.001893
par(mfrow=c(2,2))
plot(lm_full)
The residual plots show that the relationship is now linear. The only remaining problem is that the distribution of the residuals is over-dispersed relative to a normal distribution. Since the optimal transformation suggested by Box-Cox would not fix this, a GLM would be more appropriate for fitting this data. As this assignment requires linear models, we will keep them. Because the normality of the residuals is violated, we will not judge the significance of the coefficients by their t-values; instead, we will compare the performance of the models by adjusted R-squared and root mean squared error (RMSE).
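For reference, below is a minimal sketch of the GLM alternative mentioned above, a Gamma model with a log link fitted to the raw claim amounts of the crash records. The predictors are chosen for illustration only, and this model is not used elsewhere in the analysis.
#Sketch of a Gamma GLM with log link on the raw claim amounts of the crash records
#(Gamma assumes strictly positive amounts, which should hold for crash records)
glm_gamma <- glm(TARGET_AMT ~ BLUEBOOK_LOG + MVR_PTS + MSTATUS + SEX + CLM_FREQ,
                 family = Gamma(link = "log"),
                 data = train_df[train_df$TARGET_FLAG == 1, ])
summary(glm_gamma)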
Starting with our full model, we perform backward elimination by comparing the AIC of the models.
lm_AIC <- step(lm_full, trace = 0)
summary(lm_AIC)
##
## Call:
## lm(formula = TARGET_AMT_LOG ~ MSTATUS + SEX + CLM_FREQ + MVR_PTS +
## BLUEBOOK_LOG, data = lm_train_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.6994 -0.4043 0.0425 0.4106 3.2129
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.807358 0.248943 27.345 < 2e-16 ***
## MSTATUSYes -0.073271 0.034682 -2.113 0.0347 *
## SEXM 0.054122 0.034990 1.547 0.1221
## CLM_FREQ -0.022761 0.014548 -1.565 0.1178
## MVR_PTS 0.017331 0.007044 2.460 0.0140 *
## BLUEBOOK_LOG 0.156247 0.026314 5.938 3.36e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.804 on 2147 degrees of freedom
## Multiple R-squared: 0.02314, Adjusted R-squared: 0.02087
## F-statistic: 10.17 on 5 and 2147 DF, p-value: 1.201e-09
Starting with our full model, we perform backward elimination by comparing the BIC of the models.
lm_BIC <- step(lm_full, trace = 0, k = log(nrow(lm_train_df)))
summary(lm_BIC)
##
## Call:
## lm(formula = TARGET_AMT_LOG ~ BLUEBOOK_LOG, data = lm_train_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.7888 -0.3907 0.0425 0.3914 3.2417
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.77803 0.24679 27.465 < 2e-16 ***
## BLUEBOOK_LOG 0.15976 0.02626 6.085 1.38e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8058 on 2151 degrees of freedom
## Multiple R-squared: 0.01692, Adjusted R-squared: 0.01646
## F-statistic: 37.02 on 1 and 2151 DF, p-value: 1.377e-09
The model produced by backward elimination with BIC shows that the book value of the car is the most important factor determining the claim amount. It is reasonable that the claim amount is closely related to the car’s value, but what about the other characteristics of the cars?
Let’s build a model with only the characteristics of the cars, which
means the behaviors of the drivers are not considered.
lm_car <- lm(TARGET_AMT_LOG~CAR_USE+BLUEBOOK_LOG+CAR_TYPE+RED_CAR+CAR_AGE, data = lm_train_df)
summary(lm_car)
##
## Call:
## lm(formula = TARGET_AMT_LOG ~ CAR_USE + BLUEBOOK_LOG + CAR_TYPE +
## RED_CAR + CAR_AGE, data = lm_train_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.7800 -0.4013 0.0394 0.3963 3.2326
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.7691908 0.3035109 22.303 < 2e-16 ***
## CAR_USEPrivate 0.0068413 0.0414216 0.165 0.869
## BLUEBOOK_LOG 0.1558630 0.0315517 4.940 8.42e-07 ***
## CAR_TYPEPanel Truck 0.0711456 0.0847287 0.840 0.401
## CAR_TYPEPickup 0.0362248 0.0608058 0.596 0.551
## CAR_TYPESports Car 0.0214223 0.0667991 0.321 0.748
## CAR_TYPESUV 0.0349160 0.0570842 0.612 0.541
## CAR_TYPEVan 0.0017551 0.0759781 0.023 0.982
## RED_CARyes 0.0549828 0.0463057 1.187 0.235
## CAR_AGE -0.0001856 0.0032423 -0.057 0.954
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8068 on 2143 degrees of freedom
## Multiple R-squared: 0.0183, Adjusted R-squared: 0.01417
## F-statistic: 4.438 on 9 and 2143 DF, p-value: 8.838e-06
lm_models <- data.frame(model=c(""),Num_of_Coefficients=c(0),
R_squared_adj=c(0.0000), RMSE=c(0.0000))
models <- list(lm_full, lm_AIC, lm_BIC, lm_car)
model_names <- c("lm_full", "lm_AIC", "lm_BIC", "lm_car")
for (i in c(1:length(models))) {
lm_models[i,"model"] <- model_names[i]
lm_models[i,"Num_of_Coefficients"] <- length(models[[i]]$coefficients) - 1
lm_models[i,"R_squared_adj"] <- summary(models[[i]])$adj.r.squared
lm_models[i,"RMSE"] <- sqrt(mean(models[[i]]$residuals^2))
}
lm_models
Model lm_AIC has the highest adjusted R-squared, and its RMSE is very close to that of the full model, so our optimal linear model is lm_AIC. The model using only the characteristics of the cars performs worst, which suggests the driver’s behavior does affect the amount of a claim.
par(mfrow=c(2,2))
plot(lm_AIC)
Double-checking the diagnostic plots of lm_AIC, there is no serious problem other than the non-normal residuals.
The coefficient estimates are unbiased since the relationship is linear and the residuals are independent.
To check the significance of the coefficients, a bootstrap simulation could be used, but we won’t go further in this analysis.
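For completeness, here is a minimal sketch of what such a bootstrap could look like (illustrative only; 1,000 resamples chosen arbitrarily): refit lm_AIC on resampled crash records and take percentile intervals of the coefficients.
#Illustrative bootstrap of the lm_AIC coefficients (not run as part of this analysis)
set.seed(2022)
boot_coefs <- replicate(1000, {
  idx <- sample(nrow(lm_train_df), replace = TRUE)
  coef(lm(formula(lm_AIC), data = lm_train_df[idx, ]))
})
#Percentile 95% intervals; intervals excluding 0 would suggest a significant coefficient
apply(boot_coefs, 1, quantile, probs = c(0.025, 0.975))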
test_df$INDEX <- NULL
test_df$INCOME <- as.numeric(gsub('[$,]', '', test_df$INCOME))
test_df$HOME_VAL <- as.numeric(gsub('[$,]', '', test_df$HOME_VAL))
test_df$BLUEBOOK <- as.numeric(gsub('[$,]', '', test_df$BLUEBOOK))
test_df$OLDCLAIM <- as.numeric(gsub('[$,]', '', test_df$OLDCLAIM))
test_df$PARENT1 <- gsub("z_", "", test_df$PARENT1)
test_df$MSTATUS <- gsub("z_", "", test_df$MSTATUS)
test_df$SEX <- gsub("z_", "", test_df$SEX)
test_df$EDUCATION <- gsub("z_", "", test_df$EDUCATION)
test_df$EDUCATION <- gsub("<", "Less Than", test_df$EDUCATION)
test_df$JOB <- gsub("z_", "", test_df$JOB)
test_df$CAR_TYPE <- gsub("z_", "", test_df$CAR_TYPE)
test_df$URBANICITY <- ifelse(test_df$URBANICITY == "Highly Urban/ Urban", "Urban","Rural")
test_df$JOB[test_df$JOB == ""] <- NA
test_df[c("TARGET_FLAG","PARENT1","MSTATUS", "SEX", "EDUCATION", "JOB","CAR_TYPE",
"RED_CAR", "URBANICITY", "CAR_USE","REVOKED")] <-
lapply(test_df[c("TARGET_FLAG","PARENT1","MSTATUS", "SEX",
"EDUCATION", "JOB","CAR_TYPE", "RED_CAR",
"URBANICITY", "CAR_USE","REVOKED")], factor)
test_df$TARGET_FLAG <- NULL
test_df$TARGET_AMT <- NULL
test_df <- mice.reuse(mickey, test_df, maxit = 5, printFlag = FALSE, seed = 2022)[[1]]
summary(test_df)
## KIDSDRIV AGE HOMEKIDS YOJ
## Min. :0.0000 Min. :17.00 Min. :0.0000 Min. : 0.00
## 1st Qu.:0.0000 1st Qu.:39.00 1st Qu.:0.0000 1st Qu.: 9.00
## Median :0.0000 Median :45.00 Median :0.0000 Median :11.00
## Mean :0.1625 Mean :45.02 Mean :0.7174 Mean :10.35
## 3rd Qu.:0.0000 3rd Qu.:51.00 3rd Qu.:1.0000 3rd Qu.:13.00
## Max. :3.0000 Max. :73.00 Max. :5.0000 Max. :19.00
##
## INCOME PARENT1 HOME_VAL MSTATUS SEX
## Min. : 0 No :1875 Min. : 0 No : 847 F:1170
## 1st Qu.: 25846 Yes: 266 1st Qu.: 0 Yes:1294 M: 971
## Median : 52064 Median :159144
## Mean : 60475 Mean :153161
## 3rd Qu.: 86393 3rd Qu.:236150
## Max. :291182 Max. :669271
##
## EDUCATION JOB TRAVTIME
## Bachelors :581 Blue Collar :473 Min. : 5.00
## High School :622 Clerical :325 1st Qu.: 22.00
## Less ThanHigh School:312 Manager :324 Median : 33.00
## Masters :420 Professional:303 Mean : 33.15
## PhD :206 Lawyer :229 3rd Qu.: 43.00
## Home Maker :204 Max. :105.00
## (Other) :283
## CAR_USE BLUEBOOK TIF CAR_TYPE
## Commercial: 760 Min. : 1500 Min. : 1.000 Minivan :549
## Private :1381 1st Qu.: 8870 1st Qu.: 1.000 Panel Truck:177
## Median :14170 Median : 4.000 Pickup :383
## Mean :15469 Mean : 5.245 Sports Car :272
## 3rd Qu.:21050 3rd Qu.: 7.000 SUV :589
## Max. :49940 Max. :25.000 Van :171
##
## RED_CAR OLDCLAIM CLM_FREQ REVOKED MVR_PTS
## no :1543 Min. : 0 Min. :0.000 No :1880 Min. : 0.000
## yes: 598 1st Qu.: 0 1st Qu.:0.000 Yes: 261 1st Qu.: 0.000
## Median : 0 Median :0.000 Median : 1.000
## Mean : 4022 Mean :0.809 Mean : 1.766
## 3rd Qu.: 4718 3rd Qu.:2.000 3rd Qu.: 3.000
## Max. :54399 Max. :5.000 Max. :12.000
##
## CAR_AGE URBANICITY
## Min. : 0.000 Rural: 403
## 1st Qu.: 1.000 Urban:1738
## Median : 8.000
## Mean : 8.192
## 3rd Qu.:12.000
## Max. :26.000
##
test_df$YOJ_Y <- as.factor(ifelse(test_df$YOJ == 0,0,1))
test_df$INCOME_Y <- as.factor(ifelse(test_df$INCOME == 0,0,1))
test_df$HOME_VAL_Y <- as.factor(ifelse(test_df$HOME_VAL == 0,0,1))
test_df$OLDCLAIM_Y <- as.factor(ifelse(test_df$OLDCLAIM == 0,0,1))
test_df$INCOME_LOG <- log(test_df$INCOME+1)
test_df$HOME_VAL_LOG <- log(test_df$HOME_VAL+1)
test_df$BLUEBOOK_LOG <- log(test_df$BLUEBOOK)
test_df$OLDCLAIM_LOG <- log(test_df$OLDCLAIM+1)
test_df$INCOME <- NULL
test_df$HOME_VAL <- NULL
test_df$BLUEBOOK <- NULL
test_df$OLDCLAIM <- NULL
logistic_test_df <- test_df
logistic_test_df$AGE_Squared <- logistic_test_df$AGE^2
logistic_test_df$KIDSDRIV_0.5 <- (logistic_test_df$KIDSDRIV)^0.5
logistic_test_df$HOMEKIDS_0.5 <- (logistic_test_df$HOMEKIDS)^0.5
logistic_test_df$MVR_PTS_3 <- (logistic_test_df$MVR_PTS)^3
logistic_test_df$TRAVTIME_0.33 <- (logistic_test_df$TRAVTIME)^0.33
logistic_test_df$KIDSDRIV <- NULL
logistic_test_df$HOMEKIDS <- NULL
logistic_test_df$MVR_PTS <- NULL
logistic_test_df$TRAVTIME <- NULL
logistic_test_df$TARGET_FLAG <-
ifelse(predict(logi_AIC,logistic_test_df, type="response")>
logi_models[2,"Optimal_Threshold"],1,0)
test_predict <- logistic_test_df$TARGET_FLAG
train_predict <- ifelse(logi_AIC$fitted.values>logi_models[2,"Optimal_Threshold"],1,0)
dist_df <- data.frame(rbind(
cbind(train_predict,"train_predict"),
cbind(test_predict,"test_predict")
))
colnames(dist_df) <- c("value","data")
dist_df <- table(dist_df)
dist_df[,1] <- dist_df[,1]/sum(dist_df[,1])
dist_df[,2] <- dist_df[,2]/sum(dist_df[,2])
dist_df
## data
## value test_predict train_predict
## 0 0.6268099 0.6381571
## 1 0.3731901 0.3618429
The model produces similar results for the training and test data: around 60% of cases are classified as no crash and 40% as having a crash. Our logistic model therefore behaves consistently when predicting unseen data.
lm_test_df <- test_df
lm_test_df$TARGET_FLAG <- NULL
lm_test_df$TARGET_AMT <- predict(lm_AIC,lm_test_df)
lm_test_df$TARGET_AMT <- exp(lm_test_df$TARGET_AMT)
test_df$TARGET_FLAG <- logistic_test_df$TARGET_FLAG
test_df$TARGET_AMT <- lm_test_df$TARGET_AMT * logistic_test_df$TARGET_FLAG
train_predict <- exp(lm_AIC$fitted.values)
test_predict <- test_df$TARGET_AMT[test_df$TARGET_AMT > 0]
dist_df <- data.frame(rbind(
cbind(train_predict,"train_predict"),
cbind(test_predict,"test_predict")
),stringsAsFactors=FALSE)
colnames(dist_df) <- c("value","data")
dist_df$value <- as.numeric(dist_df$value)
ggplot(dist_df, aes(x=value, color=data)) +
geom_density()
The predicted claim amounts have similar distributions for the training and test data, so our linear model also performs stably when predicting unseen data.
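As a complement to the visual comparison (a minimal sketch), a two-sample Kolmogorov-Smirnov test can quantify how close the two prediction distributions are; a large p-value would be consistent with the plot above.
#Formal comparison of the training and test prediction distributions
ks.test(train_predict, test_predict)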