Abstract
In this homework assignment, you will explore, analyze, and model a data set containing approximately 8000 records, each representing a customer at an auto insurance company. Each record has two response variables. The first response variable, TARGET_FLAG, is a 1 or a 0. A “1” means that the person was in a car crash; a “0” means that the person was not in a car crash. The second response variable is TARGET_AMT. This value is zero if the person did not crash their car; if they did crash their car, it is a value greater than zero.
Keywords: insurance, data621
Data Exploration
knitr::opts_chunk$set(echo = TRUE)
library(e1071)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(purrr)
library(tidyr)
library(ggplot2)
library(corrplot)
## corrplot 0.84 loaded
library(FactoMineR)
## Warning: package 'FactoMineR' was built under R version 3.4.4
library(VIF)
library(knitr)
library(kableExtra)
## Warning: package 'kableExtra' was built under R version 3.4.4
library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Warning: package 'Formula' was built under R version 3.4.4
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
##
## src, summarize
## The following object is masked from 'package:e1071':
##
## impute
## The following objects are masked from 'package:base':
##
## format.pval, units
library(pROC)
## Warning: package 'pROC' was built under R version 3.4.4
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(binr)
# Load the training records from CSV and report the table's dimensions.
train <- read.csv("data/insurance_training_data.csv")
dim(train)
## [1] 8161 26
#transform data
#this step is necessary in order to analyze data as it is not clean
# Convert currency-formatted strings such as "$1,234,567" to numeric.
#
# input: character vector (or factor) of dollar amounts.
# Returns a numeric vector of the same length; blank entries become NA.
currencyconv <- function(input) {
  # fixed = TRUE matches "$" and "," literally. gsub() removes every
  # occurrence, so amounts with more than one thousands separator are
  # parsed instead of silently turning into NA.
  out <- gsub("$", "", input, fixed = TRUE)
  out <- gsub(",", "", out, fixed = TRUE)
  as.numeric(out)
}
# Replace every space in the input with an underscore.
#
# input: character vector (or factor) of labels.
# Returns a character vector of the same length.
underscore <- function(input) {
  # gsub() rather than sub(): sub() stops after the first match, which
  # leaves later spaces in place in labels that contain several.
  gsub(" ", "_", input, fixed = TRUE)
}
# Clean the raw columns: parse the currency-formatted fields to numeric,
# normalise the category labels and store them as factors, and make the
# binary response a factor.
# tibble::as_tibble() replaces the deprecated dplyr::as.tbl().
train <- tibble::as_tibble(train) %>%
  mutate_at(c("INCOME", "HOME_VAL", "BLUEBOOK", "OLDCLAIM"),
            currencyconv) %>%
  mutate_at(c("EDUCATION", "JOB", "CAR_TYPE", "URBANICITY"),
            underscore) %>%
  mutate_at(c("EDUCATION", "JOB", "CAR_TYPE", "URBANICITY"),
            as.factor) %>%
  mutate(TARGET_FLAG = as.factor(TARGET_FLAG))
# Summarise every column and render the summary as a styled table.
kable_styling(kable(summary(train)))
| INDEX | TARGET_FLAG | TARGET_AMT | KIDSDRIV | AGE | HOMEKIDS | YOJ | INCOME | PARENT1 | HOME_VAL | MSTATUS | SEX | EDUCATION | JOB | TRAVTIME | CAR_USE | BLUEBOOK | TIF | CAR_TYPE | RED_CAR | OLDCLAIM | CLM_FREQ | REVOKED | MVR_PTS | CAR_AGE | URBANICITY |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Min. : 1 | 0:6008 | Min. : 0 | Min. :0.0000 | Min. :16.00 | Min. :0.0000 | Min. : 0.0 | Min. : 0 | No :7084 | Min. : 0 | Yes :4894 | M :3786 | <High_School :1203 | z_Blue_Collar:1825 | Min. : 5.00 | Commercial:3029 | Min. : 1500 | Min. : 1.000 | Minivan :2145 | no :5783 | Min. : 0 | Min. :0.0000 | No :7161 | Min. : 0.000 | Min. :-3.000 | Highly_Urban/ Urban :6492 |
| 1st Qu.: 2559 | 1:2153 | 1st Qu.: 0 | 1st Qu.:0.0000 | 1st Qu.:39.00 | 1st Qu.:0.0000 | 1st Qu.: 9.0 | 1st Qu.: 28097 | Yes:1077 | 1st Qu.: 0 | z_No:3267 | z_F:4375 | Bachelors :2242 | Clerical :1271 | 1st Qu.: 22.00 | Private :5132 | 1st Qu.: 9280 | 1st Qu.: 1.000 | Panel_Truck: 676 | yes:2378 | 1st Qu.: 0 | 1st Qu.:0.0000 | Yes:1000 | 1st Qu.: 0.000 | 1st Qu.: 1.000 | z_Highly_Rural/ Rural:1669 |
| Median : 5133 | NA | Median : 0 | Median :0.0000 | Median :45.00 | Median :0.0000 | Median :11.0 | Median : 54028 | NA | Median :161160 | NA | NA | Masters :1658 | Professional :1117 | Median : 33.00 | NA | Median :14440 | Median : 4.000 | Pickup :1389 | NA | Median : 0 | Median :0.0000 | NA | Median : 1.000 | Median : 8.000 | NA |
| Mean : 5152 | NA | Mean : 1504 | Mean :0.1711 | Mean :44.79 | Mean :0.7212 | Mean :10.5 | Mean : 61898 | NA | Mean :154867 | NA | NA | PhD : 728 | Manager : 988 | Mean : 33.49 | NA | Mean :15710 | Mean : 5.351 | Sports_Car : 907 | NA | Mean : 4037 | Mean :0.7986 | NA | Mean : 1.696 | Mean : 8.328 | NA |
| 3rd Qu.: 7745 | NA | 3rd Qu.: 1036 | 3rd Qu.:0.0000 | 3rd Qu.:51.00 | 3rd Qu.:1.0000 | 3rd Qu.:13.0 | 3rd Qu.: 85986 | NA | 3rd Qu.:238724 | NA | NA | z_High_School:2330 | Lawyer : 835 | 3rd Qu.: 44.00 | NA | 3rd Qu.:20850 | 3rd Qu.: 7.000 | Van : 750 | NA | 3rd Qu.: 4636 | 3rd Qu.:2.0000 | NA | 3rd Qu.: 3.000 | 3rd Qu.:12.000 | NA |
| Max. :10302 | NA | Max. :107586 | Max. :4.0000 | Max. :81.00 | Max. :5.0000 | Max. :23.0 | Max. :367030 | NA | Max. :885282 | NA | NA | NA | Student : 712 | Max. :142.00 | NA | Max. :69740 | Max. :25.000 | z_SUV :2294 | NA | Max. :57037 | Max. :5.0000 | NA | Max. :13.000 | Max. :28.000 | NA |
| NA | NA | NA | NA | NA’s :6 | NA | NA’s :454 | NA’s :445 | NA | NA’s :464 | NA | NA | NA | (Other) :1413 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA’s :510 | NA |
# Count the missing values in each column and render the counts as a
# styled table. vapply() pins the result to one integer per column,
# whereas the return type of sapply() depends on its input.
vapply(train, function(x) sum(is.na(x)), integer(1)) %>%
  kable() %>%
  kable_styling()
| | x |
|---|---|
| INDEX | 0 |
| TARGET_FLAG | 0 |
| TARGET_AMT | 0 |
| KIDSDRIV | 0 |
| AGE | 6 |
| HOMEKIDS | 0 |
| YOJ | 454 |
| INCOME | 445 |
| PARENT1 | 0 |
| HOME_VAL | 464 |
| MSTATUS | 0 |
| SEX | 0 |
| EDUCATION | 0 |
| JOB | 0 |
| TRAVTIME | 0 |
| CAR_USE | 0 |
| BLUEBOOK | 0 |
| TIF | 0 |
| CAR_TYPE | 0 |
| RED_CAR | 0 |
| OLDCLAIM | 0 |
| CLM_FREQ | 0 |
| REVOKED | 0 |
| MVR_PTS | 0 |
| CAR_AGE | 510 |
| URBANICITY | 0 |
# library(UpSetR)
#
# train %>% as_shadow_upset() %>% upset()
# Density plot of every numeric column, one free-scaled panel per column.
ntrain <- train %>% select_if(is.numeric)
ggplot(gather(keep(ntrain, is.numeric)), aes(value)) +
  facet_wrap(~ key, scales = "free") +
  geom_density()
## Warning: Removed 1879 rows containing non-finite values (stat_density).

#
# trainnum <- dplyr::select_if(train, is.numeric)
#
# rcorr(as.matrix(trainnum))
# corrplot(cor(trainnum), method="square")
#
# # correlation test 1
# cor.test(trainnum$HOME_VAL,trainnum$INCOME,method="pearson")
#
# #NOT significant ignore
Data Preparation
# Replace the missing values in AGE, YOJ, HOME_VAL, CAR_AGE and INCOME
# with the mean of the observed values in the same column.
train <- train %>%
  mutate_at(
    c("AGE", "YOJ", "HOME_VAL", "CAR_AGE", "INCOME"),
    function(v) {
      v[is.na(v)] <- mean(v, na.rm = TRUE)
      v
    }
  )
# Keep only the rows that have no missing value left in any column.
train <- train[complete.cases(train), ]
train2 <- train
# Apply log(x + 1) to the skewed variables HOMEKIDS, MVR_PTS, OLDCLAIM,
# TIF, KIDSDRIV and CLM_FREQ; the + 1 keeps zero values finite.
train <- train %>%
  mutate_at(
    c("HOMEKIDS", "MVR_PTS", "OLDCLAIM", "TIF", "KIDSDRIV", "CLM_FREQ"),
    function(v) log(v + 1)
  )
# Drop the INDEX column; every other column is kept in its original order.
train <- train[, setdiff(colnames(train), "INDEX")]
#
# #create variable
# train$new <- train$tax / (train$medv*10)
#
# Pairwise correlations (with p-values) among the numeric columns.
trainnum <- train %>% dplyr::select_if(is.numeric)
rcorr(as.matrix(trainnum))
## TARGET_AMT KIDSDRIV AGE HOMEKIDS YOJ INCOME HOME_VAL
## TARGET_AMT 1.00 0.06 -0.04 0.07 -0.02 -0.06 -0.08
## KIDSDRIV 0.06 1.00 -0.08 0.49 0.05 -0.05 -0.02
## AGE -0.04 -0.08 1.00 -0.47 0.13 0.18 0.20
## HOMEKIDS 0.07 0.49 -0.47 1.00 0.08 -0.16 -0.11
## YOJ -0.02 0.05 0.13 0.08 1.00 0.27 0.26
## INCOME -0.06 -0.05 0.18 -0.16 0.27 1.00 0.54
## HOME_VAL -0.08 -0.02 0.20 -0.11 0.26 0.54 1.00
## TRAVTIME 0.03 0.01 0.01 -0.01 -0.02 -0.05 -0.03
## BLUEBOOK 0.00 -0.02 0.16 -0.11 0.14 0.42 0.25
## TIF -0.05 -0.01 0.00 0.00 0.02 -0.01 0.00
## OLDCLAIM 0.13 0.05 -0.04 0.05 -0.02 -0.07 -0.11
## CLM_FREQ 0.13 0.04 -0.03 0.04 -0.02 -0.05 -0.10
## MVR_PTS 0.13 0.05 -0.07 0.06 -0.03 -0.05 -0.07
## CAR_AGE -0.06 -0.05 0.17 -0.15 0.06 0.39 0.20
## TRAVTIME BLUEBOOK TIF OLDCLAIM CLM_FREQ MVR_PTS CAR_AGE
## TARGET_AMT 0.03 0.00 -0.05 0.13 0.13 0.13 -0.06
## KIDSDRIV 0.01 -0.02 -0.01 0.05 0.04 0.05 -0.05
## AGE 0.01 0.16 0.00 -0.04 -0.03 -0.07 0.17
## HOMEKIDS -0.01 -0.11 0.00 0.05 0.04 0.06 -0.15
## YOJ -0.02 0.14 0.02 -0.02 -0.02 -0.03 0.06
## INCOME -0.05 0.42 -0.01 -0.07 -0.05 -0.05 0.39
## HOME_VAL -0.03 0.25 0.00 -0.11 -0.10 -0.07 0.20
## TRAVTIME 1.00 -0.02 -0.01 -0.01 0.00 0.01 -0.04
## BLUEBOOK -0.02 1.00 -0.01 -0.04 -0.04 -0.04 0.18
## TIF -0.01 -0.01 1.00 -0.03 -0.02 -0.04 0.00
## OLDCLAIM -0.01 -0.04 -0.03 1.00 0.93 0.44 -0.02
## CLM_FREQ 0.00 -0.04 -0.02 0.93 1.00 0.41 -0.01
## MVR_PTS 0.01 -0.04 -0.04 0.44 0.41 1.00 -0.01
## CAR_AGE -0.04 0.18 0.00 -0.02 -0.01 -0.01 1.00
##
## n= 8161
##
##
## P
## TARGET_AMT KIDSDRIV AGE HOMEKIDS YOJ INCOME HOME_VAL
## TARGET_AMT 0.0000 0.0002 0.0000 0.0585 0.0000 0.0000
## KIDSDRIV 0.0000 0.0000 0.0000 0.0000 0.0000 0.0577
## AGE 0.0002 0.0000 0.0000 0.0000 0.0000 0.0000
## HOMEKIDS 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000
## YOJ 0.0585 0.0000 0.0000 0.0000 0.0000 0.0000
## INCOME 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000
## HOME_VAL 0.0000 0.0577 0.0000 0.0000 0.0000 0.0000
## TRAVTIME 0.0115 0.5499 0.6342 0.4230 0.1362 0.0000 0.0018
## BLUEBOOK 0.6712 0.0415 0.0000 0.0000 0.0000 0.0000 0.0000
## TIF 0.0000 0.3832 0.9404 0.6725 0.0498 0.4889 0.7280
## OLDCLAIM 0.0000 0.0000 0.0004 0.0000 0.0987 0.0000 0.0000
## CLM_FREQ 0.0000 0.0000 0.0054 0.0002 0.0272 0.0000 0.0000
## MVR_PTS 0.0000 0.0000 0.0000 0.0000 0.0033 0.0000 0.0000
## CAR_AGE 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000
## TRAVTIME BLUEBOOK TIF OLDCLAIM CLM_FREQ MVR_PTS CAR_AGE
## TARGET_AMT 0.0115 0.6712 0.0000 0.0000 0.0000 0.0000 0.0000
## KIDSDRIV 0.5499 0.0415 0.3832 0.0000 0.0000 0.0000 0.0000
## AGE 0.6342 0.0000 0.9404 0.0004 0.0054 0.0000 0.0000
## HOMEKIDS 0.4230 0.0000 0.6725 0.0000 0.0002 0.0000 0.0000
## YOJ 0.1362 0.0000 0.0498 0.0987 0.0272 0.0033 0.0000
## INCOME 0.0000 0.0000 0.4889 0.0000 0.0000 0.0000 0.0000
## HOME_VAL 0.0018 0.0000 0.7280 0.0000 0.0000 0.0000 0.0000
## TRAVTIME 0.1246 0.2945 0.6009 0.7501 0.5405 0.0009
## BLUEBOOK 0.1246 0.5420 0.0001 0.0003 0.0007 0.0000
## TIF 0.2945 0.5420 0.0147 0.0408 0.0006 0.9927
## OLDCLAIM 0.6009 0.0001 0.0147 0.0000 0.0000 0.0787
## CLM_FREQ 0.7501 0.0003 0.0408 0.0000 0.0000 0.2247
## MVR_PTS 0.5405 0.0007 0.0006 0.0000 0.0000 0.4250
## CAR_AGE 0.0009 0.0000 0.9927 0.0787 0.2247 0.4250
# Visualise the correlation matrix of the numeric columns as coloured squares.
corrplot(cor(trainnum), method="square")

# Test whether the Pearson correlation between HOMEKIDS and AGE differs from zero.
cor.test(trainnum$HOMEKIDS,trainnum$AGE,method="pearson")
##
## Pearson's product-moment correlation
##
## data: trainnum$HOMEKIDS and trainnum$AGE
## t = -48.338, df = 8159, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.4885252 -0.4547891
## sample estimates:
## cor
## -0.4718298
# Copy of the prepared training data; the fitted probabilities of each
# logistic model are later stored on it as pred1, pred2 and pred3.
train2<-train
Build Models LOGIT TARGET_FLAG
# MODEL 1: logistic regression of TARGET_FLAG on every other column of
# train except TARGET_AMT (the second response variable).
# The family is given as a direct call to binomial() instead of a call
# made through the string literal "binomial".
logit <- glm(
  formula = TARGET_FLAG ~ . - TARGET_AMT,
  data = train,
  family = binomial(link = "logit")
)
summary(logit)
##
## Call:
## glm(formula = TARGET_FLAG ~ . - TARGET_AMT, family = binomial(link = "logit"),
## data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.5262 -0.7180 -0.3983 0.6545 3.1455
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -7.942e-01 3.293e-01 -2.412 0.015880 *
## KIDSDRIV 6.821e-01 1.103e-01 6.185 6.21e-10 ***
## AGE 4.736e-05 4.078e-03 0.012 0.990734
## HOMEKIDS 1.513e-01 8.300e-02 1.823 0.068320 .
## YOJ -1.353e-02 8.578e-03 -1.577 0.114756
## INCOME -3.457e-06 1.076e-06 -3.212 0.001317 **
## PARENT1Yes 3.295e-01 1.144e-01 2.881 0.003970 **
## HOME_VAL -1.323e-06 3.419e-07 -3.871 0.000109 ***
## MSTATUSz_No 5.146e-01 8.493e-02 6.059 1.37e-09 ***
## SEXz_F -8.929e-02 1.120e-01 -0.797 0.425327
## EDUCATIONBachelors -3.720e-01 1.154e-01 -3.223 0.001267 **
## EDUCATIONMasters -2.803e-01 1.785e-01 -1.570 0.116405
## EDUCATIONPhD -1.496e-01 2.135e-01 -0.701 0.483401
## EDUCATIONz_High_School 2.111e-02 9.487e-02 0.222 0.823945
## JOBClerical 3.986e-01 1.963e-01 2.030 0.042359 *
## JOBDoctor -4.227e-01 2.662e-01 -1.588 0.112286
## JOBHome_Maker 2.049e-01 2.099e-01 0.976 0.328988
## JOBLawyer 1.172e-01 1.693e-01 0.692 0.488652
## JOBManager -5.616e-01 1.712e-01 -3.280 0.001038 **
## JOBProfessional 1.673e-01 1.782e-01 0.939 0.347724
## JOBStudent 2.038e-01 2.140e-01 0.953 0.340799
## JOBz_Blue_Collar 3.101e-01 1.853e-01 1.674 0.094190 .
## TRAVTIME 1.483e-02 1.880e-03 7.890 3.02e-15 ***
## CAR_USEPrivate -7.604e-01 9.172e-02 -8.291 < 2e-16 ***
## BLUEBOOK -2.079e-05 5.255e-06 -3.956 7.63e-05 ***
## TIF -3.257e-01 4.138e-02 -7.869 3.56e-15 ***
## CAR_TYPEPanel_Truck 5.701e-01 1.613e-01 3.533 0.000410 ***
## CAR_TYPEPickup 5.578e-01 1.007e-01 5.540 3.03e-08 ***
## CAR_TYPESports_Car 1.031e+00 1.298e-01 7.942 2.00e-15 ***
## CAR_TYPEVan 6.158e-01 1.264e-01 4.872 1.10e-06 ***
## CAR_TYPEz_SUV 7.787e-01 1.111e-01 7.007 2.43e-12 ***
## RED_CARyes -5.766e-03 8.631e-02 -0.067 0.946741
## OLDCLAIM 6.763e-03 1.697e-02 0.398 0.690300
## CLM_FREQ 3.160e-01 1.277e-01 2.474 0.013363 *
## REVOKEDYes 7.242e-01 8.184e-02 8.850 < 2e-16 ***
## MVR_PTS 2.808e-01 4.202e-02 6.682 2.35e-11 ***
## CAR_AGE -1.807e-03 7.530e-03 -0.240 0.810372
## URBANICITYz_Highly_Rural/ Rural -2.371e+00 1.130e-01 -20.989 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 9418.0 on 8160 degrees of freedom
## Residual deviance: 7308.4 on 8123 degrees of freedom
## AIC: 7384.4
##
## Number of Fisher Scoring iterations: 5
# Exponentiate the log-odds coefficients to read them as odds ratios.
exp(logit$coefficients)
## (Intercept) KIDSDRIV
## 0.45194170 1.97796637
## AGE HOMEKIDS
## 1.00004736 1.16334008
## YOJ INCOME
## 0.98656280 0.99999654
## PARENT1Yes HOME_VAL
## 1.39023185 0.99999868
## MSTATUSz_No SEXz_F
## 1.67300058 0.91457630
## EDUCATIONBachelors EDUCATIONMasters
## 0.68933598 0.75559308
## EDUCATIONPhD EDUCATIONz_High_School
## 0.86104231 1.02133113
## JOBClerical JOBDoctor
## 1.48969494 0.65526320
## JOBHome_Maker JOBLawyer
## 1.22734502 1.12438495
## JOBManager JOBProfessional
## 0.57030294 1.18213072
## JOBStudent JOBz_Blue_Collar
## 1.22609195 1.36361825
## TRAVTIME CAR_USEPrivate
## 1.01494543 0.46747121
## BLUEBOOK TIF
## 0.99997921 0.72204512
## CAR_TYPEPanel_Truck CAR_TYPEPickup
## 1.76835639 1.74677458
## CAR_TYPESports_Car CAR_TYPEVan
## 2.80272210 1.85105526
## CAR_TYPEz_SUV RED_CARyes
## 2.17863803 0.99425108
## OLDCLAIM CLM_FREQ
## 1.00678601 1.37166999
## REVOKEDYes MVR_PTS
## 2.06310705 1.32420339
## CAR_AGE URBANICITYz_Highly_Rural/ Rural
## 0.99819493 0.09334636
# Mean of the logistic density evaluated at each observation's linear predictor.
logitscalar <- mean(dlogis(predict(logit, type = "link")))
# Scale the coefficients by that factor to express them on the probability
# scale (an average-marginal-effect style approximation).
logitscalar * coef(logit)
## (Intercept) KIDSDRIV
## -1.158016e-01 9.945167e-02
## AGE HOMEKIDS
## 6.904809e-06 2.206017e-02
## YOJ INCOME
## -1.972543e-03 -5.040064e-07
## PARENT1Yes HOME_VAL
## 4.803969e-02 -1.929523e-07
## MSTATUSz_No SEXz_F
## 7.503592e-02 -1.301990e-02
## EDUCATIONBachelors EDUCATIONMasters
## -5.424472e-02 -4.086324e-02
## EDUCATIONPhD EDUCATIONz_High_School
## -2.181469e-02 3.077558e-03
## JOBClerical JOBDoctor
## 5.811519e-02 -6.163603e-02
## JOBHome_Maker JOBLawyer
## 2.986941e-02 1.709406e-02
## JOBManager JOBProfessional
## -8.188439e-02 2.439650e-02
## JOBStudent JOBz_Blue_Collar
## 2.972047e-02 4.522137e-02
## TRAVTIME CAR_USEPrivate
## 2.163050e-03 -1.108755e-01
## BLUEBOOK TIF
## -3.030737e-06 -4.748519e-02
## CAR_TYPEPanel_Truck CAR_TYPEPickup
## 8.311836e-02 8.132789e-02
## CAR_TYPESports_Car CAR_TYPEVan
## 1.502692e-01 8.978260e-02
## CAR_TYPEz_SUV RED_CARyes
## 1.135413e-01 -8.406609e-04
## OLDCLAIM CLM_FREQ
## 9.861172e-04 4.607979e-02
## REVOKEDYes MVR_PTS
## 1.055966e-01 4.094471e-02
## CAR_AGE URBANICITYz_Highly_Rural/ Rural
## -2.634331e-04 -3.457765e-01
# Wald-type confidence intervals for the coefficients (2.5% and 97.5% bounds).
confint.default(logit)
## 2.5 % 97.5 %
## (Intercept) -1.439652e+00 -1.487526e-01
## KIDSDRIV 4.659328e-01 8.982056e-01
## AGE -7.944409e-03 8.039120e-03
## HOMEKIDS -1.137668e-02 3.139672e-01
## YOJ -3.034002e-02 3.283437e-03
## INCOME -5.565742e-06 -1.347510e-06
## PARENT1Yes 1.052923e-01 5.536488e-01
## HOME_VAL -1.993393e-06 -6.532553e-07
## MSTATUSz_No 3.481541e-01 6.810835e-01
## SEXz_F -3.088261e-01 1.302374e-01
## EDUCATIONBachelors -5.982429e-01 -1.458101e-01
## EDUCATIONMasters -6.301049e-01 6.960026e-02
## EDUCATIONPhD -5.680126e-01 2.687893e-01
## EDUCATIONz_High_School -1.648411e-01 2.070547e-01
## JOBClerical 1.374718e-02 7.833955e-01
## JOBDoctor -9.444515e-01 9.901495e-02
## JOBHome_Maker -2.064601e-01 6.161667e-01
## JOBLawyer -2.145962e-01 4.490685e-01
## JOBManager -8.971700e-01 -2.260052e-01
## JOBProfessional -1.819185e-01 5.165555e-01
## JOBStudent -2.155551e-01 6.232188e-01
## JOBz_Blue_Collar -5.304572e-02 6.733290e-01
## TRAVTIME 1.114967e-02 1.852002e-02
## CAR_USEPrivate -9.401775e-01 -5.806575e-01
## BLUEBOOK -3.108426e-05 -1.048713e-05
## TIF -4.067786e-01 -2.445566e-01
## CAR_TYPEPanel_Truck 2.538386e-01 8.862624e-01
## CAR_TYPEPickup 3.604241e-01 7.551178e-01
## CAR_TYPESports_Car 7.762439e-01 1.284938e+00
## CAR_TYPEVan 3.680626e-01 8.634492e-01
## CAR_TYPEz_SUV 5.608898e-01 9.965101e-01
## RED_CARyes -1.749296e-01 1.633986e-01
## OLDCLAIM -2.650452e-02 4.003069e-02
## CLM_FREQ 6.565860e-02 5.663993e-01
## REVOKEDYes 5.638194e-01 8.846068e-01
## MVR_PTS 1.984459e-01 3.631762e-01
## CAR_AGE -1.656445e-02 1.295105e-02
## URBANICITYz_Highly_Rural/ Rural -2.592883e+00 -2.149994e+00
# Fitted crash probabilities from Model 1, kept both as a standalone
# vector and as column pred1 of train2.
predlogit <- predict(logit, type = "response")
train2$pred1 <- predlogit
summary(predlogit)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.002449 0.077438 0.201727 0.263816 0.403524 0.958860
# Confusion table: observed TARGET_FLAG against the fitted probabilities rounded to 0/1.
table(true = train$TARGET_FLAG, pred = round(fitted(logit)))
## pred
## true 0 1
## 0 5532 476
## 1 1251 902
#plots for Model 1
# Lay out the plots drawn by plot() on the fitted model in a 2 x 2 grid.
par(mfrow=c(2,2))
plot(logit)

# Distribution of Model 1's fitted probabilities.
ggplot(data.frame(train2$pred1), aes(x = train2.pred1)) +
  geom_histogram(bins = 50, fill = 'grey50') +
  labs(title = 'Histogram of Predictions') +
  theme_bw()

# ROC curve of the fitted probabilities against the observed TARGET_FLAG.
plot.roc(train$TARGET_FLAG, train2$pred1)
# Extract the terms that are significant at the 5% level and rerun the model.
sigvars <- data.frame(summary(logit)$coef[summary(logit)$coef[,4] <= .05, 4])
# tibble::rownames_to_column() replaces the deprecated dplyr::add_rownames().
sigvars <- tibble::rownames_to_column(sigvars, "vars")
colist <- dplyr::pull(sigvars, vars)
# colist<-colist[2:11]
# The names pulled above are coefficient labels (for example "PARENT1Yes"),
# not column names, so the predictor columns are listed explicitly.
colist <- c("KIDSDRIV","INCOME","PARENT1","HOME_VAL","MSTATUS","EDUCATION","JOB","TRAVTIME","CAR_USE","BLUEBOOK","TIF","CAR_TYPE","CLM_FREQ","REVOKED","MVR_PTS","URBANICITY")
idx <- match(colist, names(train))
# Selected predictors plus the response, as the data set for Model 2.
trainmod2 <- cbind(train[, idx], train2["TARGET_FLAG"])
# MODEL 2: logistic regression of TARGET_FLAG on the columns of trainmod2.
# The family is given as a direct call to binomial() instead of a call
# made through the string literal "binomial".
logit2 <- glm(
  TARGET_FLAG ~ .,
  data = trainmod2,
  family = binomial(link = "logit")
)
summary(logit2)
##
## Call:
## glm(formula = TARGET_FLAG ~ ., family = binomial(link = "logit"),
## data = trainmod2)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.5523 -0.7190 -0.3985 0.6497 3.1365
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -8.728e-01 2.620e-01 -3.332 0.000863 ***
## KIDSDRIV 7.664e-01 9.775e-02 7.841 4.48e-15 ***
## INCOME -3.552e-06 1.071e-06 -3.317 0.000910 ***
## PARENT1Yes 4.476e-01 9.451e-02 4.736 2.18e-06 ***
## HOME_VAL -1.367e-06 3.407e-07 -4.012 6.03e-05 ***
## MSTATUSz_No 4.766e-01 7.969e-02 5.981 2.22e-09 ***
## EDUCATIONBachelors -3.839e-01 1.086e-01 -3.534 0.000409 ***
## EDUCATIONMasters -3.062e-01 1.612e-01 -1.899 0.057514 .
## EDUCATIONPhD -1.761e-01 1.997e-01 -0.882 0.377940
## EDUCATIONz_High_School 1.682e-02 9.450e-02 0.178 0.858752
## JOBClerical 4.011e-01 1.962e-01 2.044 0.040930 *
## JOBDoctor -4.251e-01 2.658e-01 -1.599 0.109770
## JOBHome_Maker 2.561e-01 2.038e-01 1.257 0.208790
## JOBLawyer 1.091e-01 1.690e-01 0.646 0.518557
## JOBManager -5.704e-01 1.711e-01 -3.335 0.000854 ***
## JOBProfessional 1.578e-01 1.781e-01 0.886 0.375433
## JOBStudent 2.732e-01 2.104e-01 1.299 0.194092
## JOBz_Blue_Collar 3.064e-01 1.852e-01 1.654 0.098047 .
## TRAVTIME 1.471e-02 1.877e-03 7.837 4.61e-15 ***
## CAR_USEPrivate -7.623e-01 9.158e-02 -8.324 < 2e-16 ***
## BLUEBOOK -2.321e-05 4.715e-06 -4.922 8.56e-07 ***
## TIF -3.257e-01 4.135e-02 -7.875 3.41e-15 ***
## CAR_TYPEPanel_Truck 6.226e-01 1.505e-01 4.137 3.53e-05 ***
## CAR_TYPEPickup 5.528e-01 1.006e-01 5.497 3.86e-08 ***
## CAR_TYPESports_Car 9.746e-01 1.074e-01 9.077 < 2e-16 ***
## CAR_TYPEVan 6.466e-01 1.220e-01 5.301 1.15e-07 ***
## CAR_TYPEz_SUV 7.218e-01 8.585e-02 8.407 < 2e-16 ***
## CLM_FREQ 3.624e-01 5.464e-02 6.631 3.33e-11 ***
## REVOKEDYes 7.349e-01 8.022e-02 9.161 < 2e-16 ***
## MVR_PTS 2.863e-01 4.138e-02 6.920 4.51e-12 ***
## URBANICITYz_Highly_Rural/ Rural -2.373e+00 1.129e-01 -21.024 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 9418.0 on 8160 degrees of freedom
## Residual deviance: 7314.8 on 8130 degrees of freedom
## AIC: 7376.8
##
## Number of Fisher Scoring iterations: 5
# Exponentiate the log-odds coefficients to read them as odds ratios.
exp(logit2$coefficients)
## (Intercept) KIDSDRIV
## 0.41776744 2.15209411
## INCOME PARENT1Yes
## 0.99999645 1.56455092
## HOME_VAL MSTATUSz_No
## 0.99999863 1.61063279
## EDUCATIONBachelors EDUCATIONMasters
## 0.68119924 0.73624403
## EDUCATIONPhD EDUCATIONz_High_School
## 0.83855457 1.01695952
## JOBClerical JOBDoctor
## 1.49341342 0.65370747
## JOBHome_Maker JOBLawyer
## 1.29192357 1.11527301
## JOBManager JOBProfessional
## 0.56527588 1.17097642
## JOBStudent JOBz_Blue_Collar
## 1.31419704 1.35848308
## TRAVTIME CAR_USEPrivate
## 1.01482211 0.46658415
## BLUEBOOK TIF
## 0.99997679 0.72205023
## CAR_TYPEPanel_Truck CAR_TYPEPickup
## 1.86373779 1.73806537
## CAR_TYPESports_Car CAR_TYPEVan
## 2.65019303 1.90901065
## CAR_TYPEz_SUV CLM_FREQ
## 2.05810528 1.43671851
## REVOKEDYes MVR_PTS
## 2.08518827 1.33154581
## URBANICITYz_Highly_Rural/ Rural
## 0.09320986
# Mean of the logistic density evaluated at each observation's linear predictor.
logit2scalar <- mean(dlogis(predict(logit2, type = "link")))
# Scale the coefficients by that factor to express them on the probability
# scale (an average-marginal-effect style approximation).
logit2scalar * coef(logit2)
## (Intercept) KIDSDRIV
## -1.274002e-01 1.118714e-01
## INCOME PARENT1Yes
## -5.185070e-07 6.533249e-02
## HOME_VAL MSTATUSz_No
## -1.994968e-07 6.956953e-02
## EDUCATIONBachelors EDUCATIONMasters
## -5.603494e-02 -4.469269e-02
## EDUCATIONPhD EDUCATIONz_High_School
## -2.570038e-02 2.454692e-03
## JOBClerical JOBDoctor
## 5.854023e-02 -6.204783e-02
## JOBHome_Maker JOBLawyer
## 3.738562e-02 1.592436e-02
## JOBManager JOBProfessional
## -8.326286e-02 2.303837e-02
## JOBStudent JOBz_Blue_Collar
## 3.988064e-02 4.471824e-02
## TRAVTIME CAR_USEPrivate
## 2.147590e-03 -1.112694e-01
## BLUEBOOK TIF
## -3.387609e-06 -4.753412e-02
## CAR_TYPEPanel_Truck CAR_TYPEPickup
## 9.087371e-02 8.068389e-02
## CAR_TYPESports_Car CAR_TYPEVan
## 1.422595e-01 9.437697e-02
## CAR_TYPEz_SUV CLM_FREQ
## 1.053534e-01 5.289110e-02
## REVOKEDYes MVR_PTS
## 1.072616e-01 4.179488e-02
## URBANICITYz_Highly_Rural/ Rural
## -3.463539e-01
# Fitted crash probabilities from Model 2, kept both as a standalone
# vector and as column pred2 of train2.
predlogit2 <- predict(logit2, type = "response")
train2$pred2 <- predlogit2
summary(predlogit2)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.002282 0.077191 0.202256 0.263816 0.403691 0.961502
# Confusion table: observed TARGET_FLAG against the fitted probabilities rounded to 0/1.
table(true = train$TARGET_FLAG, pred = round(fitted(logit2)))
## pred
## true 0 1
## 0 5541 467
## 1 1247 906
#plots for Model 2
# Lay out the plots drawn by plot() on the fitted model in a 2 x 2 grid.
par(mfrow=c(2,2))

plot(logit2)

# Distribution of Model 2's fitted probabilities.
ggplot(data.frame(train2$pred2), aes(x = train2.pred2)) +
  geom_histogram(bins = 50, fill = 'grey50') +
  labs(title = 'Histogram of Predictions') +
  theme_bw()

# ROC curve of the fitted probabilities against the observed TARGET_FLAG.
plot.roc(train$TARGET_FLAG, train2$pred2)
# MODEL 3: reduced logistic regression using only KIDSDRIV, INCOME,
# HOME_VAL and TRAVTIME (original note: "PC Model no racial bias").
# The family is given as a direct call to binomial() instead of a call
# made through the string literal "binomial".
logit3 <- glm(
  TARGET_FLAG ~ KIDSDRIV + INCOME + HOME_VAL + TRAVTIME,
  data = train,
  family = binomial(link = "logit")
)
summary(logit3)
##
## Call:
## glm(formula = TARGET_FLAG ~ KIDSDRIV + INCOME + HOME_VAL + TRAVTIME,
## family = binomial(link = "logit"), data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.5299 -0.8217 -0.6749 1.2315 2.8090
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -6.876e-01 7.305e-02 -9.412 < 2e-16 ***
## KIDSDRIV 7.266e-01 8.115e-02 8.953 < 2e-16 ***
## INCOME -3.497e-06 6.826e-07 -5.123 3.01e-07 ***
## HOME_VAL -2.972e-06 2.499e-07 -11.895 < 2e-16 ***
## TRAVTIME 5.880e-03 1.598e-03 3.679 0.000234 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 9418.0 on 8160 degrees of freedom
## Residual deviance: 9021.1 on 8156 degrees of freedom
## AIC: 9031.1
##
## Number of Fisher Scoring iterations: 4
# Exponentiate the log-odds coefficients to read them as odds ratios.
exp(logit3$coefficients)
## (Intercept) KIDSDRIV INCOME HOME_VAL TRAVTIME
## 0.5028055 2.0679778 0.9999965 0.9999970 1.0058969
# Fitted crash probabilities from Model 3, kept both as a standalone
# vector and as column pred3 of train2.
predlogit3 <- predict(logit3, type = "response")
train2$pred3 <- predlogit3
summary(predlogit3)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.01176 0.19679 0.25557 0.26382 0.32927 0.68970
# Confusion table: observed TARGET_FLAG against the fitted probabilities rounded to 0/1.
table(true = train$TARGET_FLAG, pred = round(fitted(logit3)))
## pred
## true 0 1
## 0 5937 71
## 1 2086 67
#plots for Model 3
# Lay out the plots drawn by plot() on the fitted model in a 2 x 2 grid.
par(mfrow=c(2,2))

plot(logit3)

# Distribution of Model 3's fitted probabilities.
ggplot(data.frame(train2$pred3), aes(x = train2.pred3)) +
  geom_histogram(bins = 50, fill = 'grey50') +
  labs(title = 'Histogram of Predictions') +
  theme_bw()

# ROC curve of the fitted probabilities against the observed TARGET_FLAG.
plot.roc(train$TARGET_FLAG, train2$pred3)
# Mean of the logistic density evaluated at each observation's linear predictor.
logit3scalar <- mean(dlogis(predict(logit3, type = "link")))
# Scale the coefficients by that factor to express them on the probability
# scale (an average-marginal-effect style approximation).
logit3scalar * coef(logit3)
## (Intercept) KIDSDRIV INCOME HOME_VAL TRAVTIME
## -1.271908e-01 1.344090e-01 -6.468917e-07 -5.498016e-07 1.087668e-03
# Model 1 scaled coefficients, rounded to two decimals for side-by-side comparison.
round(logitscalar * coef(logit),2)
## (Intercept) KIDSDRIV
## -0.12 0.10
## AGE HOMEKIDS
## 0.00 0.02
## YOJ INCOME
## 0.00 0.00
## PARENT1Yes HOME_VAL
## 0.05 0.00
## MSTATUSz_No SEXz_F
## 0.08 -0.01
## EDUCATIONBachelors EDUCATIONMasters
## -0.05 -0.04
## EDUCATIONPhD EDUCATIONz_High_School
## -0.02 0.00
## JOBClerical JOBDoctor
## 0.06 -0.06
## JOBHome_Maker JOBLawyer
## 0.03 0.02
## JOBManager JOBProfessional
## -0.08 0.02
## JOBStudent JOBz_Blue_Collar
## 0.03 0.05
## TRAVTIME CAR_USEPrivate
## 0.00 -0.11
## BLUEBOOK TIF
## 0.00 -0.05
## CAR_TYPEPanel_Truck CAR_TYPEPickup
## 0.08 0.08
## CAR_TYPESports_Car CAR_TYPEVan
## 0.15 0.09
## CAR_TYPEz_SUV RED_CARyes
## 0.11 0.00
## OLDCLAIM CLM_FREQ
## 0.00 0.05
## REVOKEDYes MVR_PTS
## 0.11 0.04
## CAR_AGE URBANICITYz_Highly_Rural/ Rural
## 0.00 -0.35
# Model 2 scaled coefficients, rounded to two decimals for side-by-side comparison.
round(logit2scalar * coef(logit2),2)
## (Intercept) KIDSDRIV
## -0.13 0.11
## INCOME PARENT1Yes
## 0.00 0.07
## HOME_VAL MSTATUSz_No
## 0.00 0.07
## EDUCATIONBachelors EDUCATIONMasters
## -0.06 -0.04
## EDUCATIONPhD EDUCATIONz_High_School
## -0.03 0.00
## JOBClerical JOBDoctor
## 0.06 -0.06
## JOBHome_Maker JOBLawyer
## 0.04 0.02
## JOBManager JOBProfessional
## -0.08 0.02
## JOBStudent JOBz_Blue_Collar
## 0.04 0.04
## TRAVTIME CAR_USEPrivate
## 0.00 -0.11
## BLUEBOOK TIF
## 0.00 -0.05
## CAR_TYPEPanel_Truck CAR_TYPEPickup
## 0.09 0.08
## CAR_TYPESports_Car CAR_TYPEVan
## 0.14 0.09
## CAR_TYPEz_SUV CLM_FREQ
## 0.11 0.05
## REVOKEDYes MVR_PTS
## 0.11 0.04
## URBANICITYz_Highly_Rural/ Rural
## -0.35
# Model 3 scaled coefficients, rounded to two decimals for side-by-side comparison.
round(logit3scalar * coef(logit3),2)
## (Intercept) KIDSDRIV INCOME HOME_VAL TRAVTIME
## -0.13 0.13 0.00 0.00 0.00

Build Models GENERAL TARGET_AMT
#MODEL 1
# Linear regression of TARGET_AMT on every other column of train.
# NOTE(review): "." also pulls in TARGET_FLAG, the other response
# variable — confirm it is meant to be used as a predictor here.
model <- lm(TARGET_AMT ~ ., data=train)
summary(model)
##
## Call:
## lm(formula = TARGET_AMT ~ ., data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6234 -465 -58 243 101178
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.975e+02 5.010e+02 -1.193 0.2331
## TARGET_FLAG1 5.707e+03 1.134e+02 50.329 < 2e-16 ***
## KIDSDRIV -2.216e+01 1.781e+02 -0.124 0.9010
## AGE 6.145e+00 6.271e+00 0.980 0.3272
## HOMEKIDS 9.215e+01 1.256e+02 0.733 0.4633
## YOJ 7.685e+00 1.319e+01 0.583 0.5601
## INCOME -2.258e-03 1.577e-03 -1.431 0.1524
## PARENT1Yes 1.209e+02 1.830e+02 0.661 0.5088
## HOME_VAL 3.864e-04 5.165e-04 0.748 0.4545
## MSTATUSz_No 1.770e+02 1.282e+02 1.381 0.1673
## SEXz_F -2.896e+02 1.606e+02 -1.804 0.0713 .
## EDUCATIONBachelors 6.823e+01 1.790e+02 0.381 0.7031
## EDUCATIONMasters 2.235e+02 2.620e+02 0.853 0.3937
## EDUCATIONPhD 4.283e+02 3.110e+02 1.377 0.1685
## EDUCATIONz_High_School -1.243e+02 1.502e+02 -0.828 0.4077
## JOBClerical -8.406e+00 2.984e+02 -0.028 0.9775
## JOBDoctor -2.812e+02 3.571e+02 -0.788 0.4310
## JOBHome_Maker -7.045e+01 3.185e+02 -0.221 0.8249
## JOBLawyer 7.660e+01 2.582e+02 0.297 0.7667
## JOBManager -1.265e+02 2.521e+02 -0.502 0.6158
## JOBProfessional 1.733e+02 2.698e+02 0.642 0.5206
## JOBStudent -1.306e+02 3.266e+02 -0.400 0.6892
## JOBz_Blue_Collar 5.187e+01 2.813e+02 0.184 0.8537
## TRAVTIME 5.682e-01 2.824e+00 0.201 0.8405
## CAR_USEPrivate -9.993e+01 1.443e+02 -0.693 0.4886
## BLUEBOOK 2.944e-02 7.536e-03 3.906 9.45e-05 ***
## TIF -1.653e+01 6.277e+01 -0.263 0.7922
## CAR_TYPEPanel_Truck -5.880e+01 2.430e+02 -0.242 0.8088
## CAR_TYPEPickup -3.318e+01 1.493e+02 -0.222 0.8241
## CAR_TYPESports_Car 2.098e+02 1.910e+02 1.099 0.2720
## CAR_TYPEVan 9.709e+01 1.865e+02 0.521 0.6026
## CAR_TYPEz_SUV 1.621e+02 1.571e+02 1.032 0.3021
## RED_CARyes -2.696e+01 1.302e+02 -0.207 0.8360
## OLDCLAIM 4.079e+00 2.908e+01 0.140 0.8884
## CLM_FREQ -8.551e+01 2.210e+02 -0.387 0.6989
## REVOKEDYes -2.991e+02 1.385e+02 -2.160 0.0308 *
## MVR_PTS 1.396e+02 6.716e+01 2.079 0.0376 *
## CAR_AGE -2.520e+01 1.118e+01 -2.254 0.0242 *
## URBANICITYz_Highly_Rural/ Rural 2.987e+01 1.272e+02 0.235 0.8143
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3970 on 8122 degrees of freedom
## Multiple R-squared: 0.2912, Adjusted R-squared: 0.2879
## F-statistic: 87.8 on 38 and 8122 DF, p-value: < 2.2e-16
# Residuals vs fitted values and fitted values vs observed TARGET_AMT, side by side.
par(mfrow=c(1,2))
plot(model$residuals ~ model$fitted.values)
plot(model$fitted.values,train$TARGET_AMT)

# Plots drawn by plot() on the fitted lm, in a 2 x 2 grid.
par(mfrow=c(2,2))
plot(model)

# Extract the terms that are significant at the 5% level and rerun the model.
sigvars <- data.frame(summary(model)$coef[summary(model)$coef[,4] <= .05, 4])
# tibble::rownames_to_column() replaces the deprecated dplyr::add_rownames().
sigvars <- tibble::rownames_to_column(sigvars, "vars")
colist <- dplyr::pull(sigvars, vars)
# The names pulled above are coefficient labels (for example "REVOKEDYes"),
# not column names, so the predictor columns are listed explicitly.
colist <- c("TARGET_FLAG","BLUEBOOK","REVOKED","MVR_PTS","CAR_AGE")
idx <- match(colist, names(train))
# Selected predictors plus the response, as the data set for Model 2.
trainmod2 <- cbind(train[, idx], train["TARGET_AMT"])
#MODEL 2
# Linear regression of TARGET_AMT on the columns collected in trainmod2.
model2<-lm(TARGET_AMT ~ ., data=trainmod2)
summary(model2)
##
## Call:
## lm(formula = TARGET_AMT ~ ., data = trainmod2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6269 -378 -34 192 101505
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.315e+02 1.206e+02 -3.579 0.000347 ***
## TARGET_FLAG1 5.735e+03 1.036e+02 55.334 < 2e-16 ***
## BLUEBOOK 3.010e-02 5.328e-03 5.649 1.67e-08 ***
## REVOKEDYes -2.874e+02 1.356e+02 -2.120 0.034021 *
## MVR_PTS 1.309e+02 6.101e+01 2.145 0.031986 *
## CAR_AGE -1.291e+01 8.122e+00 -1.590 0.111894
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3968 on 8155 degrees of freedom
## Multiple R-squared: 0.289, Adjusted R-squared: 0.2886
## F-statistic: 662.9 on 5 and 8155 DF, p-value: < 2.2e-16
# Residuals vs fitted values and fitted values vs observed TARGET_AMT for
# Model 2. A 1 x 2 layout matches the same plot pair drawn for the other
# linear models; a 2 x 2 grid would leave the bottom row empty.
par(mfrow = c(1, 2))
plot(model2$residuals ~ model2$fitted.values)
plot(model2$fitted.values, train$TARGET_AMT)
# Plots drawn by plot() on the fitted lm, in a 2 x 2 grid.
par(mfrow = c(2, 2))

plot(model2)

# Residual-vs-fitted plots of the reduced model (model2) and the full
# model (model) side by side, each with a zero reference line.
par(mfrow = c(1, 2))
plot(model2$residuals ~ model2$fitted.values, main = "New Reduced Var Model")
abline(h = 0)
# Title typo fixed: "Orignal" -> "Original".
plot(model$residuals ~ model$fitted.values, main = "Original Model All Vars")
abline(h = 0)

#MODEL 3
#remove variables with opposite coefficients
# Linear regression of TARGET_AMT on KIDSDRIV, INCOME, HOME_VAL and TRAVTIME only.
model3<-lm(TARGET_AMT ~ KIDSDRIV + INCOME + HOME_VAL + TRAVTIME, data=train)
summary(model3)
##
## Call:
## lm(formula = TARGET_AMT ~ KIDSDRIV + INCOME + HOME_VAL + TRAVTIME,
## data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3610 -1652 -1239 -318 106277
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.680e+03 1.470e+02 11.426 < 2e-16 ***
## KIDSDRIV 9.172e+02 1.789e+02 5.126 3.03e-07 ***
## INCOME -1.242e-03 1.336e-03 -0.930 0.3522
## HOME_VAL -2.809e-03 4.920e-04 -5.710 1.17e-08 ***
## TRAVTIME 7.234e+00 3.260e+00 2.219 0.0265 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4679 on 8156 degrees of freedom
## Multiple R-squared: 0.01096, Adjusted R-squared: 0.01047
## F-statistic: 22.59 on 4 and 8156 DF, p-value: < 2.2e-16
# Model 3 diagnostics, two panels side by side: residuals against fitted
# values, then fitted values against the observed TARGET_AMT.
par(mfrow=c(1,2))
plot(model3$residuals ~ model3$fitted.values)
plot(model3$fitted.values,train$TARGET_AMT)

# Switch to a 2x2 grid for the set of plots that plot() draws for the fitted
# model object.
par(mfrow=c(2,2))
plot(model3)

Select Models
# Load the evaluation set and keep a copy of the rows exactly as read.
test <- read.csv("data/insurance-evaluation-data.csv")
test2 <- test
dim(test)
## [1] 2141 26
# Prepare the evaluation set in one pipeline:
#   1. fill both response columns with a 0 placeholder,
#   2. run the currency columns through currencyconv,
#   3. run the categorical columns through underscore, then make them factors,
#   4. make TARGET_FLAG a factor.
test <- as.tbl(test) %>%
  mutate(TARGET_AMT = 0, TARGET_FLAG = 0) %>%
  mutate_at(
    c("INCOME", "HOME_VAL", "BLUEBOOK", "OLDCLAIM"),
    currencyconv
  ) %>%
  mutate_at(
    c("EDUCATION", "JOB", "CAR_TYPE", "URBANICITY"),
    function(x) as.factor(underscore(x))
  ) %>%
  mutate(TARGET_FLAG = as.factor(TARGET_FLAG))
# Apply log(x + 1) to these columns; the +1 offset maps a value of 0 to 0
# instead of -Inf.
test[c("HOMEKIDS", "MVR_PTS", "OLDCLAIM", "TIF", "KIDSDRIV", "CLM_FREQ")] <-
  lapply(
    test[c("HOMEKIDS", "MVR_PTS", "OLDCLAIM", "TIF", "KIDSDRIV", "CLM_FREQ")],
    function(x) log(x + 1)
  )

# Fill missing values in these columns with the column's own mean, computed
# over the non-missing entries of the evaluation set.
test[c("AGE", "YOJ", "HOME_VAL", "CAR_AGE", "INCOME")] <-
  lapply(
    test[c("AGE", "YOJ", "HOME_VAL", "CAR_AGE", "INCOME")],
    function(x) {
      x[is.na(x)] <- mean(x, na.rm = TRUE)
      x
    }
  )

# Drop the INDEX column; every other column is kept.
test <- test[, colnames(test) != "INDEX"]
# Score the evaluation rows with the `logit` model fitted earlier in the file.
# NOTE(review): presumably a binomial glm, in which case type="response"
# yields predicted probabilities -- confirm against its definition.
TARGET_FLAG <- predict(logit, newdata = test, type="response")
# Classify as 1 when the predicted value exceeds 0.5, otherwise 0.
y_pred_num <- ifelse(TARGET_FLAG > 0.5, 1, 0)
# Fix both levels explicitly so the factor has 0 and 1 even if one is absent.
y_pred <- factor(y_pred_num, levels=c(0, 1))
# Count of rows predicted as 0 and as 1.
summary(y_pred)
## 0 1
## 1776 365
# Two-row table of summary statistics rounded to 4 decimals:
# row 1 is predlogit, row 2 is the evaluation-set predictions in TARGET_FLAG.
kable(rbind(
  round(summary(predlogit), 4),
  round(summary(TARGET_FLAG), 4)
))
| Min. | 1st Qu. | Median | Mean | 3rd Qu. | Max. |
|---|---|---|---|---|---|
| 0.0024 | 0.0774 | 0.2017 | 0.2638 | 0.4035 | 0.9589 |
| 0.0031 | 0.0777 | 0.2183 | 0.2708 | 0.4102 | 0.9464 |
# Second stage: predict the claim amount with the full linear model.
# `model` uses TARGET_FLAG as a predictor, so the evaluation rows must carry
# the predicted crash class from the classification step. Previously the
# column still held the all-zero placeholder, which scored every row as a
# non-crash and kept all fitted amounts close to zero.
test$TARGET_FLAG <- factor(unname(y_pred_num), levels = c(0, 1))
# Copy of the evaluation data without the TARGET_FLAG column.
test2 <- test[, !(colnames(test) %in% c("TARGET_FLAG"))]
# Fitted amount plus lower/upper confidence bounds for each evaluation row.
TARGET_AMT <- predict(model, newdata = test, interval = "confidence")
summary(TARGET_AMT)
## fit lwr upr
## Min. :-1206.170 Min. :-1870.4 Min. :-542.0
## 1st Qu.: -255.615 1st Qu.: -782.6 1st Qu.: 256.4
## Median : -22.708 Median : -538.1 Median : 478.1
## Mean : -8.173 Mean : -540.5 Mean : 524.1
## 3rd Qu.: 223.762 3rd Qu.: -303.8 3rd Qu.: 774.3
## Max. : 1251.287 Max. : 521.4 Max. :1998.7
# Print the coefficient summary of the full model (`model`) that was used for
# the TARGET_AMT predictions.
summary(model)
##
## Call:
## lm(formula = TARGET_AMT ~ ., data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6234 -465 -58 243 101178
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.975e+02 5.010e+02 -1.193 0.2331
## TARGET_FLAG1 5.707e+03 1.134e+02 50.329 < 2e-16 ***
## KIDSDRIV -2.216e+01 1.781e+02 -0.124 0.9010
## AGE 6.145e+00 6.271e+00 0.980 0.3272
## HOMEKIDS 9.215e+01 1.256e+02 0.733 0.4633
## YOJ 7.685e+00 1.319e+01 0.583 0.5601
## INCOME -2.258e-03 1.577e-03 -1.431 0.1524
## PARENT1Yes 1.209e+02 1.830e+02 0.661 0.5088
## HOME_VAL 3.864e-04 5.165e-04 0.748 0.4545
## MSTATUSz_No 1.770e+02 1.282e+02 1.381 0.1673
## SEXz_F -2.896e+02 1.606e+02 -1.804 0.0713 .
## EDUCATIONBachelors 6.823e+01 1.790e+02 0.381 0.7031
## EDUCATIONMasters 2.235e+02 2.620e+02 0.853 0.3937
## EDUCATIONPhD 4.283e+02 3.110e+02 1.377 0.1685
## EDUCATIONz_High_School -1.243e+02 1.502e+02 -0.828 0.4077
## JOBClerical -8.406e+00 2.984e+02 -0.028 0.9775
## JOBDoctor -2.812e+02 3.571e+02 -0.788 0.4310
## JOBHome_Maker -7.045e+01 3.185e+02 -0.221 0.8249
## JOBLawyer 7.660e+01 2.582e+02 0.297 0.7667
## JOBManager -1.265e+02 2.521e+02 -0.502 0.6158
## JOBProfessional 1.733e+02 2.698e+02 0.642 0.5206
## JOBStudent -1.306e+02 3.266e+02 -0.400 0.6892
## JOBz_Blue_Collar 5.187e+01 2.813e+02 0.184 0.8537
## TRAVTIME 5.682e-01 2.824e+00 0.201 0.8405
## CAR_USEPrivate -9.993e+01 1.443e+02 -0.693 0.4886
## BLUEBOOK 2.944e-02 7.536e-03 3.906 9.45e-05 ***
## TIF -1.653e+01 6.277e+01 -0.263 0.7922
## CAR_TYPEPanel_Truck -5.880e+01 2.430e+02 -0.242 0.8088
## CAR_TYPEPickup -3.318e+01 1.493e+02 -0.222 0.8241
## CAR_TYPESports_Car 2.098e+02 1.910e+02 1.099 0.2720
## CAR_TYPEVan 9.709e+01 1.865e+02 0.521 0.6026
## CAR_TYPEz_SUV 1.621e+02 1.571e+02 1.032 0.3021
## RED_CARyes -2.696e+01 1.302e+02 -0.207 0.8360
## OLDCLAIM 4.079e+00 2.908e+01 0.140 0.8884
## CLM_FREQ -8.551e+01 2.210e+02 -0.387 0.6989
## REVOKEDYes -2.991e+02 1.385e+02 -2.160 0.0308 *
## MVR_PTS 1.396e+02 6.716e+01 2.079 0.0376 *
## CAR_AGE -2.520e+01 1.118e+01 -2.254 0.0242 *
## URBANICITYz_Highly_Rural/ Rural 2.987e+01 1.272e+02 0.235 0.8143
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3970 on 8122 degrees of freedom
## Multiple R-squared: 0.2912, Adjusted R-squared: 0.2879
## F-statistic: 87.8 on 38 and 8122 DF, p-value: < 2.2e-16