The training dataset contains 8161 rows and 26 columns, with integer, numeric, and character data types. The column values are a mix of continuous, discrete, categorical, and binary measures. There are 23 predictors and 2 response variables describing customers of an auto insurance company. In about 27% of the records the customer was in a car crash (TARGET_FLAG = 1), and TARGET_AMT records the corresponding cost.
A summary and boxplots of all variables suggest some columns have incorrect formats, some have missing data, and some may contain outliers.
library(ggcorrplot)
library(car)
library(MASS)
library(dplyr)
library(ggplot2)
library(caret)
library(pROC)
library(pscl)
library(psych)
library(data.table)
# Read the training data
train_df <- read.csv("https://raw.githubusercontent.com/L-Velasco/DATA621_FA18/master/HW4/insurance_training_data.csv", stringsAsFactors = FALSE)
dim(train_df)
## [1] 8161 26
str(train_df)
## 'data.frame': 8161 obs. of 26 variables:
## $ INDEX : int 1 2 4 5 6 7 8 11 12 13 ...
## $ TARGET_FLAG: int 0 0 0 0 0 1 0 1 1 0 ...
## $ TARGET_AMT : num 0 0 0 0 0 ...
## $ KIDSDRIV : int 0 0 0 0 0 0 0 1 0 0 ...
## $ AGE : int 60 43 35 51 50 34 54 37 34 50 ...
## $ HOMEKIDS : int 0 0 1 0 0 1 0 2 0 0 ...
## $ YOJ : int 11 11 10 14 NA 12 NA NA 10 7 ...
## $ INCOME : chr "$67,349" "$91,449" "$16,039" "" ...
## $ PARENT1 : chr "No" "No" "No" "No" ...
## $ HOME_VAL : chr "$0" "$257,252" "$124,191" "$306,251" ...
## $ MSTATUS : chr "z_No" "z_No" "Yes" "Yes" ...
## $ SEX : chr "M" "M" "z_F" "M" ...
## $ EDUCATION : chr "PhD" "z_High School" "z_High School" "<High School" ...
## $ JOB : chr "Professional" "z_Blue Collar" "Clerical" "z_Blue Collar" ...
## $ TRAVTIME : int 14 22 5 32 36 46 33 44 34 48 ...
## $ CAR_USE : chr "Private" "Commercial" "Private" "Private" ...
## $ BLUEBOOK : chr "$14,230" "$14,940" "$4,010" "$15,440" ...
## $ TIF : int 11 1 4 7 1 1 1 1 1 7 ...
## $ CAR_TYPE : chr "Minivan" "Minivan" "z_SUV" "Minivan" ...
## $ RED_CAR : chr "yes" "yes" "no" "yes" ...
## $ OLDCLAIM : chr "$4,461" "$0" "$38,690" "$0" ...
## $ CLM_FREQ : int 2 0 2 0 2 0 0 1 0 0 ...
## $ REVOKED : chr "No" "No" "No" "No" ...
## $ MVR_PTS : int 3 0 3 0 3 0 0 10 0 1 ...
## $ CAR_AGE : int 18 1 10 6 17 7 1 7 1 17 ...
## $ URBANICITY : chr "Highly Urban/ Urban" "Highly Urban/ Urban" "Highly Urban/ Urban" "Highly Urban/ Urban" ...
Display the first few rows of the dataset:
head(train_df)
## INDEX TARGET_FLAG TARGET_AMT KIDSDRIV AGE HOMEKIDS YOJ INCOME PARENT1
## 1 1 0 0 0 60 0 11 $67,349 No
## 2 2 0 0 0 43 0 11 $91,449 No
## 3 4 0 0 0 35 1 10 $16,039 No
## 4 5 0 0 0 51 0 14 No
## 5 6 0 0 0 50 0 NA $114,986 No
## 6 7 1 2946 0 34 1 12 $125,301 Yes
## HOME_VAL MSTATUS SEX EDUCATION JOB TRAVTIME CAR_USE
## 1 $0 z_No M PhD Professional 14 Private
## 2 $257,252 z_No M z_High School z_Blue Collar 22 Commercial
## 3 $124,191 Yes z_F z_High School Clerical 5 Private
## 4 $306,251 Yes M <High School z_Blue Collar 32 Private
## 5 $243,925 Yes z_F PhD Doctor 36 Private
## 6 $0 z_No z_F Bachelors z_Blue Collar 46 Commercial
## BLUEBOOK TIF CAR_TYPE RED_CAR OLDCLAIM CLM_FREQ REVOKED MVR_PTS
## 1 $14,230 11 Minivan yes $4,461 2 No 3
## 2 $14,940 1 Minivan yes $0 0 No 0
## 3 $4,010 4 z_SUV no $38,690 2 No 3
## 4 $15,440 7 Minivan yes $0 0 No 0
## 5 $18,000 1 z_SUV no $19,217 2 Yes 3
## 6 $17,430 1 Sports Car no $0 0 No 0
## CAR_AGE URBANICITY
## 1 18 Highly Urban/ Urban
## 2 1 Highly Urban/ Urban
## 3 10 Highly Urban/ Urban
## 4 6 Highly Urban/ Urban
## 5 17 Highly Urban/ Urban
## 6 7 Highly Urban/ Urban
Some columns need to be removed, converted, and cleaned before the data can be meaningfully described.
# Exclude the INDEX column
tr <- train_df[-1]
# Convert to numeric
tr$INCOME <- as.numeric(gsub('[$,]', '', tr$INCOME))
tr$HOME_VAL <- as.numeric(gsub('[$,]', '', tr$HOME_VAL))
tr$BLUEBOOK <- as.numeric(gsub('[$,]', '', tr$BLUEBOOK))
tr$OLDCLAIM <- as.numeric(gsub('[$,]', '', tr$OLDCLAIM))
# Remove irrelevant characters
tr$MSTATUS <- gsub("z_", "", tr$MSTATUS)
tr$SEX <- gsub("z_", "", tr$SEX)
tr$EDUCATION <- gsub("z_", "", tr$EDUCATION)
tr$JOB <- gsub("z_", "", tr$JOB)
tr$CAR_USE <- gsub("z_", "", tr$CAR_USE)
tr$CAR_TYPE <- gsub("z_", "", tr$CAR_TYPE)
tr$URBANICITY <- gsub("z_", "", tr$URBANICITY)
# Reorder columns -- predictor categorical, predictor numeric, target
indx <- c(8, 10:13, 15, 18:19, 22, 25, 3:7, 9, 14, 16:17, 20:21, 23:24, 1:2)
tr_ordered <- tr
setcolorder(tr_ordered,indx)
## categorical 1:10 / numeric 11:23 / response 24:25
table(tr$PARENT1)
##
## No Yes
## 7084 1077
table(tr$MSTATUS)
##
## No Yes
## 3267 4894
table(tr$SEX)
##
## F M
## 4375 3786
table(tr$EDUCATION)
##
## <High School Bachelors High School Masters PhD
## 1203 2242 2330 1658 728
table(tr$JOB)
##
## Blue Collar Clerical Doctor Home Maker
## 526 1825 1271 246 641
## Lawyer Manager Professional Student
## 835 988 1117 712
table(tr$CAR_USE)
##
## Commercial Private
## 3029 5132
table(tr$CAR_TYPE)
##
## Minivan Panel Truck Pickup Sports Car SUV Van
## 2145 676 1389 907 2294 750
table(tr$RED_CAR)
##
## no yes
## 5783 2378
table(tr$REVOKED)
##
## No Yes
## 7161 1000
table(tr$URBANICITY)
##
## Highly Rural/ Rural Highly Urban/ Urban
## 1669 6492
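The numeric predictors (columns 11 through 23 after reordering) are summarized next; a call along these lines (an assumption, as the exact call is not shown) produces the output below.
# Summary of the numeric predictors
summary(tr_ordered[, 11:23])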
## KIDSDRIV AGE HOMEKIDS YOJ
## Min. :0.0000 Min. :16.00 Min. :0.0000 Min. : 0.0
## 1st Qu.:0.0000 1st Qu.:39.00 1st Qu.:0.0000 1st Qu.: 9.0
## Median :0.0000 Median :45.00 Median :0.0000 Median :11.0
## Mean :0.1711 Mean :44.79 Mean :0.7212 Mean :10.5
## 3rd Qu.:0.0000 3rd Qu.:51.00 3rd Qu.:1.0000 3rd Qu.:13.0
## Max. :4.0000 Max. :81.00 Max. :5.0000 Max. :23.0
## NA's :6 NA's :454
## INCOME HOME_VAL TRAVTIME BLUEBOOK
## Min. : 0 Min. : 0 Min. : 5.00 Min. : 1500
## 1st Qu.: 28097 1st Qu.: 0 1st Qu.: 22.00 1st Qu.: 9280
## Median : 54028 Median :161160 Median : 33.00 Median :14440
## Mean : 61898 Mean :154867 Mean : 33.49 Mean :15710
## 3rd Qu.: 85986 3rd Qu.:238724 3rd Qu.: 44.00 3rd Qu.:20850
## Max. :367030 Max. :885282 Max. :142.00 Max. :69740
## NA's :445 NA's :464
## TIF OLDCLAIM CLM_FREQ MVR_PTS
## Min. : 1.000 Min. : 0 Min. :0.0000 Min. : 0.000
## 1st Qu.: 1.000 1st Qu.: 0 1st Qu.:0.0000 1st Qu.: 0.000
## Median : 4.000 Median : 0 Median :0.0000 Median : 1.000
## Mean : 5.351 Mean : 4037 Mean :0.7986 Mean : 1.696
## 3rd Qu.: 7.000 3rd Qu.: 4636 3rd Qu.:2.0000 3rd Qu.: 3.000
## Max. :25.000 Max. :57037 Max. :5.0000 Max. :13.000
##
## CAR_AGE
## Min. :-3.000
## 1st Qu.: 1.000
## Median : 8.000
## Mean : 8.328
## 3rd Qu.:12.000
## Max. :28.000
## NA's :510
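Since ggcorrplot is loaded above, the numeric predictors can also be scanned for collinearity. The following is a minimal sketch (an assumption, not necessarily the original call):
# Pairwise-complete correlation matrix of the numeric predictors (columns 11:23)
corr_mat <- cor(tr_ordered[, 11:23], use = "pairwise.complete.obs")
ggcorrplot(corr_mat, type = "lower", lab = TRUE, lab_size = 2)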
Summary plot of numeric predictors against target flag response (0=Car was not in a crash, 1=Car was in a crash)
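A minimal sketch of one way to produce such a panel (an assumption, not necessarily the original plotting code):
# Boxplot of each numeric predictor (columns 11:23) split by TARGET_FLAG
par(mfrow=c(3,5))
for (val in 11:23) {
  boxplot(tr_ordered[,val] ~ tr_ordered$TARGET_FLAG,
          xlab="TARGET_FLAG", ylab=names(tr_ordered)[val])
}
par(mfrow=c(1,1))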
To prepare the data for modeling, variables will be imputed for missing values and treated for outliers.
tr_prep <- tr_ordered
Columns with Missing Data
M <- sapply(tr_prep, function(x) sum(x == "", na.rm = TRUE) + sum(is.na(x))); names(M[M > 0])
## [1] "JOB" "AGE" "YOJ" "INCOME" "HOME_VAL" "CAR_AGE"
The numeric variables with missing data, shown in the histograms below, will be imputed with the variable's mean, or with its median if the distribution is skewed (a skewness check follows the histograms).
#colnames(tr_prep)[colSums(is.na(tr_prep)) > 0]
x <- c(12, 14, 15, 16, 23)
par(mfrow=c(2,3))
for (val in x) {
hist(tr_prep[,val],xlab=names(tr_prep[val]), main="")
}
par(mfrow=c(1,1))
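To choose between the mean and the median, a skewness check such as the following can be used (a sketch; psych::skew is one option and not necessarily what was run originally):
# Skewness of the numeric variables with missing values
sapply(tr_prep[, c("AGE", "YOJ", "INCOME", "HOME_VAL", "CAR_AGE")],
       function(x) psych::skew(x, na.rm = TRUE))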
The missing categorical data (blank JOB values) will be imputed with the most common job title for the record's EDUCATION level, as suggested by the cross-tabulation below.
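A cross-tabulation like the following (a sketch, not part of the original code) shows which job is most common within each education level:
# Most common JOB per EDUCATION level among records where JOB is not blank
with(tr_prep[tr_prep$JOB != "", ], table(EDUCATION, JOB))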
#impute
tr_prep = tr_prep %>%
mutate(AGE =
ifelse(is.na(AGE),
mean(AGE, na.rm=TRUE), AGE)) %>%
mutate(YOJ =
ifelse(is.na(YOJ),
mean(YOJ, na.rm=TRUE), YOJ)) %>%
mutate(INCOME =
ifelse(is.na(INCOME),
median(INCOME, na.rm=TRUE), INCOME)) %>%
mutate(HOME_VAL =
ifelse(is.na(HOME_VAL),
mean(HOME_VAL, na.rm=TRUE), HOME_VAL)) %>%
mutate(CAR_AGE =
ifelse(is.na(CAR_AGE),
mean(CAR_AGE, na.rm=TRUE), CAR_AGE)) %>%
mutate(JOB =
ifelse((JOB == "" & EDUCATION == 'PhD'),
"Doctor", JOB)) %>%
mutate(JOB =
ifelse((JOB == "" & EDUCATION == 'Masters'),
"Lawyer", JOB))
Check that no column has missing data:
M <- sapply(tr_prep, function(x) sum(x == "", na.rm = TRUE) + sum(is.na(x))); names(M[M > 0])
## character(0)
Observations below Q1 - 1.5*IQR will be capped at the 5th percentile, and observations above Q3 + 1.5*IQR will be capped at the 95th percentile.
# Outlier Capping
tr_prep2 <- tr_prep
id <- c(11:23)
for (val in id) {
qnt <- quantile(tr_prep2[,val], probs=c(.25, .75), na.rm = T)
caps <- quantile(tr_prep2[,val], probs=c(.05, .95), na.rm = T)
H <- 1.5 * IQR(tr_prep2[,val], na.rm = T)
tr_prep2[,val][tr_prep2[,val] < (qnt[1] - H)] <- caps[1]
tr_prep2[,val][tr_prep2[,val] > (qnt[2] + H)] <- caps[2]
}
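As a quick sanity check (a sketch, not in the original script), the effect of capping can be inspected on a variable with a long right tail such as TRAVTIME:
# TRAVTIME before and after outlier capping
summary(tr_prep$TRAVTIME)
summary(tr_prep2$TRAVTIME)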
Each model will have a logistic regression version (for TARGET_FLAG) and a multiple linear regression version (for TARGET_AMT).
For the logistic regression models, the training dataset will be split 80/20: 80% of the observations are used to fit the model and the remaining 20% are held out to derive performance metrics.
# Set the seed before partitioning so the 80/20 split is reproducible
set.seed(123)
nTrain <- createDataPartition(tr_prep2$TARGET_FLAG, p=0.8, list=FALSE)
ntraining <- tr_prep2[nTrain,]
ntesting <- tr_prep2[-nTrain,]
# Logistic regression: build the full model on the training set
full.model_FLAG <- glm(TARGET_FLAG ~.-TARGET_AMT, data = ntraining , family = binomial)
summary(full.model_FLAG)
##
## Call:
## glm(formula = TARGET_FLAG ~ . - TARGET_AMT, family = binomial,
## data = ntraining)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.2073 -0.7169 -0.3970 0.6696 3.1598
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.532e+00 3.195e-01 -7.927 2.25e-15 ***
## PARENT1Yes 2.931e-01 1.246e-01 2.353 0.018604 *
## MSTATUSYes -4.692e-01 9.432e-02 -4.974 6.55e-07 ***
## SEXM 9.600e-02 1.247e-01 0.770 0.441300
## EDUCATIONBachelors -3.992e-01 1.295e-01 -3.082 0.002056 **
## EDUCATIONHigh School 2.464e-02 1.052e-01 0.234 0.814770
## EDUCATIONMasters -3.847e-01 2.052e-01 -1.875 0.060780 .
## EDUCATIONPhD -1.230e-01 2.576e-01 -0.477 0.633054
## JOBClerical 1.055e-01 1.186e-01 0.890 0.373675
## JOBDoctor -7.330e-01 2.793e-01 -2.624 0.008682 **
## JOBHome Maker -1.202e-01 1.698e-01 -0.708 0.479005
## JOBLawyer -1.293e-01 1.952e-01 -0.662 0.507894
## JOBManager -7.869e-01 1.533e-01 -5.133 2.85e-07 ***
## JOBProfessional -1.201e-01 1.341e-01 -0.895 0.370688
## JOBStudent -1.050e-01 1.450e-01 -0.724 0.469077
## CAR_USEPrivate -7.674e-01 9.742e-02 -7.877 3.36e-15 ***
## CAR_TYPEPanel Truck 5.309e-01 1.759e-01 3.019 0.002539 **
## CAR_TYPEPickup 5.769e-01 1.121e-01 5.146 2.67e-07 ***
## CAR_TYPESports Car 1.045e+00 1.446e-01 7.227 4.95e-13 ***
## CAR_TYPESUV 7.752e-01 1.245e-01 6.224 4.86e-10 ***
## CAR_TYPEVan 6.807e-01 1.400e-01 4.864 1.15e-06 ***
## RED_CARyes -2.540e-02 9.669e-02 -0.263 0.792779
## REVOKEDYes 9.430e-01 1.016e-01 9.284 < 2e-16 ***
## URBANICITYHighly Urban/ Urban 2.390e+00 1.252e-01 19.094 < 2e-16 ***
## KIDSDRIV 6.144e-01 1.091e-01 5.629 1.81e-08 ***
## AGE -5.143e-03 4.529e-03 -1.136 0.256101
## HOMEKIDS 5.039e-02 4.463e-02 1.129 0.258872
## YOJ -6.208e-03 9.527e-03 -0.652 0.514621
## INCOME -4.383e-06 1.369e-06 -3.202 0.001364 **
## HOME_VAL -1.403e-06 3.877e-07 -3.619 0.000296 ***
## TRAVTIME 1.573e-02 2.174e-03 7.238 4.54e-13 ***
## BLUEBOOK -1.626e-05 6.035e-06 -2.694 0.007062 **
## TIF -6.038e-02 8.589e-03 -7.030 2.07e-12 ***
## OLDCLAIM -1.761e-05 5.318e-06 -3.312 0.000928 ***
## CLM_FREQ 2.193e-01 3.268e-02 6.712 1.92e-11 ***
## MVR_PTS 1.116e-01 1.640e-02 6.807 9.94e-12 ***
## CAR_AGE 2.768e-03 8.500e-03 0.326 0.744733
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 7568.3 on 6528 degrees of freedom
## Residual deviance: 5859.8 on 6492 degrees of freedom
## AIC: 5933.8
##
## Number of Fisher Scoring iterations: 5
round(exp(cbind(Estimate=coef(full.model_FLAG))),2)
## Estimate
## (Intercept) 0.08
## PARENT1Yes 1.34
## MSTATUSYes 0.63
## SEXM 1.10
## EDUCATIONBachelors 0.67
## EDUCATIONHigh School 1.02
## EDUCATIONMasters 0.68
## EDUCATIONPhD 0.88
## JOBClerical 1.11
## JOBDoctor 0.48
## JOBHome Maker 0.89
## JOBLawyer 0.88
## JOBManager 0.46
## JOBProfessional 0.89
## JOBStudent 0.90
## CAR_USEPrivate 0.46
## CAR_TYPEPanel Truck 1.70
## CAR_TYPEPickup 1.78
## CAR_TYPESports Car 2.84
## CAR_TYPESUV 2.17
## CAR_TYPEVan 1.98
## RED_CARyes 0.97
## REVOKEDYes 2.57
## URBANICITYHighly Urban/ Urban 10.92
## KIDSDRIV 1.85
## AGE 0.99
## HOMEKIDS 1.05
## YOJ 0.99
## INCOME 1.00
## HOME_VAL 1.00
## TRAVTIME 1.02
## BLUEBOOK 1.00
## TIF 0.94
## OLDCLAIM 1.00
## CLM_FREQ 1.25
## MVR_PTS 1.12
## CAR_AGE 1.00
# evaluate the model by predicting using the testing set
m1_prob <- predict(full.model_FLAG, ntesting, type = "response")
m1_pclass <- ifelse(m1_prob >= 0.5, 1, 0)
# create confusion matrix
pclass <- factor(m1_pclass,levels = c(1,0))
aclass <- factor(ntesting$TARGET_FLAG,levels = c(1,0))
confusionMatrix(pclass, aclass);
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 0
## 1 171 92
## 0 243 1126
##
## Accuracy : 0.7947
## 95% CI : (0.7743, 0.8141)
## No Information Rate : 0.7463
## P-Value [Acc > NIR] : 2.504e-06
##
## Kappa : 0.3837
## Mcnemar's Test P-Value : 2.498e-16
##
## Sensitivity : 0.4130
## Specificity : 0.9245
## Pos Pred Value : 0.6502
## Neg Pred Value : 0.8225
## Prevalence : 0.2537
## Detection Rate : 0.1048
## Detection Prevalence : 0.1612
## Balanced Accuracy : 0.6688
##
## 'Positive' Class : 1
##
# plot and show area under the curve
plot(roc(ntesting$TARGET_FLAG, m1_prob),print.auc=TRUE)
# get McFadden
m1_mcFadden <- pR2(full.model_FLAG); m1_mcFadden["McFadden"]
## McFadden
## 0.2257429
## --------------------------------
# Linear Regression - TARGET_AMT
full.model.AMT <- lm(TARGET_AMT ~. -TARGET_FLAG, data = tr_prep2)
summary(full.model.AMT)
##
## Call:
## lm(formula = TARGET_AMT ~ . - TARGET_FLAG, data = tr_prep2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5169 -1704 -754 344 103686
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.821e+01 4.779e+02 0.205 0.837204
## PARENT1Yes 5.364e+02 2.043e+02 2.626 0.008654 **
## MSTATUSYes -5.759e+02 1.453e+02 -3.963 7.47e-05 ***
## SEXM 3.471e+02 1.833e+02 1.894 0.058294 .
## EDUCATIONBachelors -2.659e+02 2.055e+02 -1.294 0.195735
## EDUCATIONHigh School -9.582e+01 1.716e+02 -0.558 0.576571
## EDUCATIONMasters -4.219e+01 3.061e+02 -0.138 0.890376
## EDUCATIONPhD 3.898e+02 3.757e+02 1.038 0.299462
## JOBClerical 1.773e+01 1.917e+02 0.093 0.926295
## JOBDoctor -1.046e+03 4.009e+02 -2.608 0.009117 **
## JOBHome Maker -1.745e+02 2.684e+02 -0.650 0.515478
## JOBLawyer -2.484e+02 2.928e+02 -0.848 0.396242
## JOBManager -9.855e+02 2.328e+02 -4.233 2.33e-05 ***
## JOBProfessional -2.882e+01 2.117e+02 -0.136 0.891724
## JOBStudent -2.275e+02 2.358e+02 -0.965 0.334779
## CAR_USEPrivate -8.000e+02 1.578e+02 -5.071 4.05e-07 ***
## CAR_TYPEPanel Truck 2.668e+02 2.724e+02 0.979 0.327500
## CAR_TYPEPickup 3.707e+02 1.706e+02 2.172 0.029869 *
## CAR_TYPESports Car 1.014e+03 2.178e+02 4.656 3.28e-06 ***
## CAR_TYPESUV 7.434e+02 1.793e+02 4.145 3.43e-05 ***
## CAR_TYPEVan 5.220e+02 2.127e+02 2.454 0.014162 *
## RED_CARyes -3.741e+01 1.491e+02 -0.251 0.801877
## REVOKEDYes 5.906e+02 1.743e+02 3.387 0.000709 ***
## URBANICITYHighly Urban/ Urban 1.675e+03 1.392e+02 12.032 < 2e-16 ***
## KIDSDRIV 6.104e+02 1.789e+02 3.411 0.000650 ***
## AGE 6.643e+00 7.158e+00 0.928 0.353395
## HOMEKIDS 8.003e+01 6.981e+01 1.146 0.251677
## YOJ -5.048e+00 1.503e+01 -0.336 0.737007
## INCOME -4.652e-03 2.081e-03 -2.236 0.025408 *
## HOME_VAL -6.471e-04 5.961e-04 -1.085 0.277733
## TRAVTIME 1.285e+01 3.340e+00 3.847 0.000120 ***
## BLUEBOOK 1.357e-02 8.954e-03 1.515 0.129769
## TIF -5.120e+01 1.294e+01 -3.958 7.62e-05 ***
## OLDCLAIM -1.685e-02 9.119e-03 -1.848 0.064627 .
## CLM_FREQ 1.667e+02 5.686e+01 2.932 0.003381 **
## MVR_PTS 1.731e+02 2.790e+01 6.203 5.79e-10 ***
## CAR_AGE -2.708e+01 1.286e+01 -2.105 0.035304 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4544 on 8124 degrees of freedom
## Multiple R-squared: 0.0709, Adjusted R-squared: 0.06679
## F-statistic: 17.22 on 36 and 8124 DF, p-value: < 2.2e-16
vif(full.model.AMT)
## GVIF Df GVIF^(1/(2*Df))
## PARENT1 1.888802 1 1.374337
## MSTATUS 2.003847 1 1.415573
## SEX 3.302562 1 1.817295
## EDUCATION 15.755173 4 1.411490
## JOB 29.731384 7 1.274177
## CAR_USE 2.296181 1 1.515316
## CAR_TYPE 5.324276 5 1.182023
## RED_CAR 1.813330 1 1.346599
## REVOKED 1.291470 1 1.136429
## URBANICITY 1.245363 1 1.115958
## KIDSDRIV 1.338024 1 1.156730
## AGE 1.462748 1 1.209441
## HOMEKIDS 2.140216 1 1.462948
## YOJ 1.416953 1 1.190358
## INCOME 2.863846 1 1.692290
## HOME_VAL 2.143594 1 1.464102
## TRAVTIME 1.034456 1 1.017082
## BLUEBOOK 2.032225 1 1.425561
## TIF 1.006054 1 1.003022
## OLDCLAIM 1.827781 1 1.351954
## CLM_FREQ 1.714439 1 1.309366
## MVR_PTS 1.229565 1 1.108858
## CAR_AGE 1.975294 1 1.405451
par(mfrow=c(2,2))
plot(full.model.AMT)
par(mfrow=c(1,1))
# Stepwise (AIC-based) logistic regression, reduced from the full model
step.model_FLAG <- full.model_FLAG %>% stepAIC(trace = FALSE)
summary(step.model_FLAG)
##
## Call:
## glm(formula = TARGET_FLAG ~ PARENT1 + MSTATUS + EDUCATION + JOB +
## CAR_USE + CAR_TYPE + REVOKED + URBANICITY + KIDSDRIV + AGE +
## INCOME + HOME_VAL + TRAVTIME + BLUEBOOK + TIF + OLDCLAIM +
## CLM_FREQ + MVR_PTS, family = binomial, data = ntraining)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.2373 -0.7173 -0.3962 0.6727 3.1660
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.413e+00 2.874e-01 -8.397 < 2e-16 ***
## PARENT1Yes 3.498e-01 1.114e-01 3.140 0.001692 **
## MSTATUSYes -4.482e-01 8.998e-02 -4.981 6.32e-07 ***
## EDUCATIONBachelors -3.871e-01 1.217e-01 -3.181 0.001467 **
## EDUCATIONHigh School 2.544e-02 1.048e-01 0.243 0.808155
## EDUCATIONMasters -3.584e-01 1.854e-01 -1.933 0.053188 .
## EDUCATIONPhD -9.788e-02 2.432e-01 -0.402 0.687353
## JOBClerical 1.070e-01 1.184e-01 0.904 0.366156
## JOBDoctor -7.340e-01 2.792e-01 -2.629 0.008553 **
## JOBHome Maker -1.020e-01 1.607e-01 -0.635 0.525620
## JOBLawyer -1.293e-01 1.952e-01 -0.663 0.507617
## JOBManager -7.891e-01 1.532e-01 -5.150 2.61e-07 ***
## JOBProfessional -1.255e-01 1.340e-01 -0.937 0.348867
## JOBStudent -7.319e-02 1.386e-01 -0.528 0.597453
## CAR_USEPrivate -7.698e-01 9.730e-02 -7.911 2.55e-15 ***
## CAR_TYPEPanel Truck 5.756e-01 1.653e-01 3.482 0.000497 ***
## CAR_TYPEPickup 5.729e-01 1.120e-01 5.117 3.10e-07 ***
## CAR_TYPESports Car 9.912e-01 1.201e-01 8.254 < 2e-16 ***
## CAR_TYPESUV 7.228e-01 9.668e-02 7.476 7.64e-14 ***
## CAR_TYPEVan 7.074e-01 1.351e-01 5.236 1.64e-07 ***
## REVOKEDYes 9.462e-01 1.015e-01 9.323 < 2e-16 ***
## URBANICITYHighly Urban/ Urban 2.389e+00 1.252e-01 19.089 < 2e-16 ***
## KIDSDRIV 6.648e-01 9.828e-02 6.765 1.34e-11 ***
## AGE -6.861e-03 4.149e-03 -1.654 0.098213 .
## INCOME -4.437e-06 1.361e-06 -3.259 0.001117 **
## HOME_VAL -1.415e-06 3.873e-07 -3.653 0.000259 ***
## TRAVTIME 1.572e-02 2.172e-03 7.235 4.67e-13 ***
## BLUEBOOK -1.826e-05 5.464e-06 -3.343 0.000830 ***
## TIF -6.027e-02 8.585e-03 -7.020 2.21e-12 ***
## OLDCLAIM -1.780e-05 5.314e-06 -3.350 0.000809 ***
## CLM_FREQ 2.204e-01 3.265e-02 6.751 1.47e-11 ***
## MVR_PTS 1.122e-01 1.637e-02 6.851 7.32e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 7568.3 on 6528 degrees of freedom
## Residual deviance: 5862.0 on 6497 degrees of freedom
## AIC: 5926
##
## Number of Fisher Scoring iterations: 5
round(exp(cbind(Estimate=coef(step.model_FLAG))),2)
## Estimate
## (Intercept) 0.09
## PARENT1Yes 1.42
## MSTATUSYes 0.64
## EDUCATIONBachelors 0.68
## EDUCATIONHigh School 1.03
## EDUCATIONMasters 0.70
## EDUCATIONPhD 0.91
## JOBClerical 1.11
## JOBDoctor 0.48
## JOBHome Maker 0.90
## JOBLawyer 0.88
## JOBManager 0.45
## JOBProfessional 0.88
## JOBStudent 0.93
## CAR_USEPrivate 0.46
## CAR_TYPEPanel Truck 1.78
## CAR_TYPEPickup 1.77
## CAR_TYPESports Car 2.69
## CAR_TYPESUV 2.06
## CAR_TYPEVan 2.03
## REVOKEDYes 2.58
## URBANICITYHighly Urban/ Urban 10.90
## KIDSDRIV 1.94
## AGE 0.99
## INCOME 1.00
## HOME_VAL 1.00
## TRAVTIME 1.02
## BLUEBOOK 1.00
## TIF 0.94
## OLDCLAIM 1.00
## CLM_FREQ 1.25
## MVR_PTS 1.12
# evaluate the model by predicting using the testing set
m2_prob <- predict(step.model_FLAG, ntesting, type = "response")
m2_pclass <- ifelse(m2_prob >= 0.5, 1, 0)
# create confusion matrix
pclass <- factor(m2_pclass,levels = c(1,0))
aclass <- factor(ntesting$TARGET_FLAG,levels = c(1,0))
confusionMatrix(pclass, aclass);
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 0
## 1 174 89
## 0 240 1129
##
## Accuracy : 0.7984
## 95% CI : (0.7781, 0.8176)
## No Information Rate : 0.7463
## P-Value [Acc > NIR] : 4.255e-07
##
## Kappa : 0.3947
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.4203
## Specificity : 0.9269
## Pos Pred Value : 0.6616
## Neg Pred Value : 0.8247
## Prevalence : 0.2537
## Detection Rate : 0.1066
## Detection Prevalence : 0.1612
## Balanced Accuracy : 0.6736
##
## 'Positive' Class : 1
##
# plot and show area under the curve
plot(roc(ntesting$TARGET_FLAG, m2_prob),print.auc=TRUE)
# get McFadden
m2_mcFadden <- pR2(step.model_FLAG); m2_mcFadden["McFadden"]
## McFadden
## 0.2254604
## --------------------------------
# Stepwise (AIC-based) linear regression - TARGET_AMT
step.model.AMT <- full.model.AMT %>% stepAIC(trace = FALSE)
summary(step.model.AMT)
##
## Call:
## lm(formula = TARGET_AMT ~ PARENT1 + MSTATUS + SEX + JOB + CAR_USE +
## CAR_TYPE + REVOKED + URBANICITY + KIDSDRIV + INCOME + TRAVTIME +
## BLUEBOOK + TIF + OLDCLAIM + CLM_FREQ + MVR_PTS + CAR_AGE,
## data = tr_prep2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5204 -1696 -761 339 103637
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.990e+02 3.427e+02 0.581 0.561477
## PARENT1Yes 5.940e+02 1.783e+02 3.331 0.000869 ***
## MSTATUSYes -6.201e+02 1.196e+02 -5.184 2.23e-07 ***
## SEXM 3.182e+02 1.604e+02 1.984 0.047322 *
## JOBClerical 3.265e+01 1.907e+02 0.171 0.864042
## JOBDoctor -5.451e+02 2.860e+02 -1.906 0.056685 .
## JOBHome Maker -1.140e+02 2.465e+02 -0.462 0.643874
## JOBLawyer -1.610e+02 2.166e+02 -0.743 0.457219
## JOBManager -9.717e+02 2.116e+02 -4.592 4.46e-06 ***
## JOBProfessional -1.250e+02 1.967e+02 -0.635 0.525251
## JOBStudent -1.356e+02 2.218e+02 -0.611 0.541042
## CAR_USEPrivate -7.477e+02 1.510e+02 -4.951 7.51e-07 ***
## CAR_TYPEPanel Truck 3.094e+02 2.684e+02 1.153 0.249058
## CAR_TYPEPickup 3.965e+02 1.693e+02 2.342 0.019200 *
## CAR_TYPESports Car 1.029e+03 2.164e+02 4.755 2.02e-06 ***
## CAR_TYPESUV 7.487e+02 1.785e+02 4.194 2.77e-05 ***
## CAR_TYPEVan 5.400e+02 2.114e+02 2.555 0.010648 *
## REVOKEDYes 5.992e+02 1.742e+02 3.439 0.000587 ***
## URBANICITYHighly Urban/ Urban 1.667e+03 1.391e+02 11.985 < 2e-16 ***
## KIDSDRIV 7.014e+02 1.620e+02 4.330 1.51e-05 ***
## INCOME -5.499e-03 1.838e-03 -2.991 0.002785 **
## TRAVTIME 1.274e+01 3.338e+00 3.816 0.000137 ***
## BLUEBOOK 1.391e-02 8.854e-03 1.571 0.116175
## TIF -5.065e+01 1.293e+01 -3.917 9.05e-05 ***
## OLDCLAIM -1.693e-02 9.114e-03 -1.858 0.063258 .
## CLM_FREQ 1.688e+02 5.681e+01 2.972 0.002968 **
## MVR_PTS 1.739e+02 2.786e+01 6.241 4.56e-10 ***
## CAR_AGE -2.765e+01 1.130e+01 -2.447 0.014435 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4544 on 8133 degrees of freedom
## Multiple R-squared: 0.06987, Adjusted R-squared: 0.06678
## F-statistic: 22.63 on 27 and 8133 DF, p-value: < 2.2e-16
vif(step.model.AMT)
## GVIF Df GVIF^(1/(2*Df))
## PARENT1 1.439404 1 1.199751
## MSTATUS 1.357795 1 1.165245
## SEX 2.529123 1 1.590322
## JOB 4.392786 7 1.111501
## CAR_USE 2.103321 1 1.450283
## CAR_TYPE 5.046346 5 1.175703
## REVOKED 1.289951 1 1.135760
## URBANICITY 1.243742 1 1.115232
## KIDSDRIV 1.096511 1 1.047144
## INCOME 2.234825 1 1.494933
## TRAVTIME 1.033216 1 1.016472
## BLUEBOOK 1.986772 1 1.409529
## TIF 1.005226 1 1.002610
## OLDCLAIM 1.825934 1 1.351271
## CLM_FREQ 1.711751 1 1.308339
## MVR_PTS 1.226375 1 1.107418
## CAR_AGE 1.524544 1 1.234724
par(mfrow=c(2,2))
plot(step.model.AMT)
par(mfrow=c(1,1))
# Manually selected logistic regression model (non-significant predictors removed)
select.model_FLAG <- glm(TARGET_FLAG ~.
-TARGET_AMT
-EDUCATION
-SEX
-RED_CAR
-KIDSDRIV
-AGE
-HOMEKIDS
-YOJ
-HOME_VAL
-OLDCLAIM
-BLUEBOOK
, data = ntraining , family = binomial)
summary(select.model_FLAG)
##
## Call:
## glm(formula = TARGET_FLAG ~ . - TARGET_AMT - EDUCATION - SEX -
## RED_CAR - KIDSDRIV - AGE - HOMEKIDS - YOJ - HOME_VAL - OLDCLAIM -
## BLUEBOOK, family = binomial, data = ntraining)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.2107 -0.7298 -0.4161 0.6879 3.0244
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.942e+00 1.998e-01 -14.728 < 2e-16 ***
## PARENT1Yes 5.937e-01 1.013e-01 5.860 4.63e-09 ***
## MSTATUSYes -4.950e-01 7.459e-02 -6.636 3.23e-11 ***
## JOBClerical 1.235e-01 1.169e-01 1.056 0.290784
## JOBDoctor -6.608e-01 1.900e-01 -3.477 0.000507 ***
## JOBHome Maker -2.078e-01 1.514e-01 -1.373 0.169867
## JOBLawyer -3.432e-01 1.359e-01 -2.526 0.011531 *
## JOBManager -9.475e-01 1.375e-01 -6.890 5.59e-12 ***
## JOBProfessional -3.489e-01 1.229e-01 -2.838 0.004535 **
## JOBStudent 5.239e-02 1.341e-01 0.391 0.695921
## CAR_USEPrivate -7.046e-01 9.120e-02 -7.725 1.12e-14 ***
## CAR_TYPEPanel Truck 3.845e-01 1.494e-01 2.574 0.010057 *
## CAR_TYPEPickup 6.319e-01 1.088e-01 5.808 6.32e-09 ***
## CAR_TYPESports Car 1.018e+00 1.176e-01 8.653 < 2e-16 ***
## CAR_TYPESUV 7.800e-01 9.443e-02 8.259 < 2e-16 ***
## CAR_TYPEVan 6.278e-01 1.307e-01 4.803 1.56e-06 ***
## REVOKEDYes 7.982e-01 8.819e-02 9.051 < 2e-16 ***
## URBANICITYHighly Urban/ Urban 2.315e+00 1.237e-01 18.707 < 2e-16 ***
## INCOME -7.539e-06 1.217e-06 -6.195 5.82e-10 ***
## TRAVTIME 1.545e-02 2.145e-03 7.204 5.87e-13 ***
## TIF -5.976e-02 8.511e-03 -7.021 2.20e-12 ***
## CLM_FREQ 1.746e-01 2.804e-02 6.226 4.78e-10 ***
## MVR_PTS 1.084e-01 1.611e-02 6.728 1.72e-11 ***
## CAR_AGE -9.808e-03 7.267e-03 -1.350 0.177145
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 7568.3 on 6528 degrees of freedom
## Residual deviance: 5970.6 on 6505 degrees of freedom
## AIC: 6018.6
##
## Number of Fisher Scoring iterations: 5
round(exp(cbind(Estimate=coef(select.model_FLAG))),2)
## Estimate
## (Intercept) 0.05
## PARENT1Yes 1.81
## MSTATUSYes 0.61
## JOBClerical 1.13
## JOBDoctor 0.52
## JOBHome Maker 0.81
## JOBLawyer 0.71
## JOBManager 0.39
## JOBProfessional 0.71
## JOBStudent 1.05
## CAR_USEPrivate 0.49
## CAR_TYPEPanel Truck 1.47
## CAR_TYPEPickup 1.88
## CAR_TYPESports Car 2.77
## CAR_TYPESUV 2.18
## CAR_TYPEVan 1.87
## REVOKEDYes 2.22
## URBANICITYHighly Urban/ Urban 10.12
## INCOME 1.00
## TRAVTIME 1.02
## TIF 0.94
## CLM_FREQ 1.19
## MVR_PTS 1.11
## CAR_AGE 0.99
# evaluate the model by predicting using the testing set
m3_prob <- predict(select.model_FLAG, ntesting, type = "response")
m3_pclass <- ifelse(m3_prob >= 0.5, 1, 0)
# create confusion matrix
pclass <- factor(m3_pclass,levels = c(1,0))
aclass <- factor(ntesting$TARGET_FLAG,levels = c(1,0))
confusionMatrix(pclass, aclass);
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 0
## 1 160 89
## 0 254 1129
##
## Accuracy : 0.7898
## 95% CI : (0.7692, 0.8094)
## No Information Rate : 0.7463
## P-Value [Acc > NIR] : 2.172e-05
##
## Kappa : 0.3609
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.38647
## Specificity : 0.92693
## Pos Pred Value : 0.64257
## Neg Pred Value : 0.81634
## Prevalence : 0.25368
## Detection Rate : 0.09804
## Detection Prevalence : 0.15257
## Balanced Accuracy : 0.65670
##
## 'Positive' Class : 1
##
# plot and show area under the curve
plot(roc(ntesting$TARGET_FLAG, m3_prob),print.auc=TRUE)
# get McFadden
m3_mcFadden <- pR2(select.model_FLAG); m3_mcFadden["McFadden"]
## McFadden
## 0.2111078
## --------------------------------
# Manually selected linear regression model - TARGET_AMT
select.model.AMT <- lm(TARGET_AMT ~.
-TARGET_FLAG
-EDUCATION
-SEX
-RED_CAR
-KIDSDRIV
-AGE
-HOMEKIDS
-YOJ
-HOME_VAL
-OLDCLAIM
-BLUEBOOK
, data = tr_prep2)
summary(select.model.AMT)
##
## Call:
## lm(formula = TARGET_AMT ~ . - TARGET_FLAG - EDUCATION - SEX -
## RED_CAR - KIDSDRIV - AGE - HOMEKIDS - YOJ - HOME_VAL - OLDCLAIM -
## BLUEBOOK, data = tr_prep2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4919 -1694 -764 329 103709
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.323e+02 2.756e+02 2.294 0.021833 *
## PARENT1Yes 7.893e+02 1.716e+02 4.600 4.29e-06 ***
## MSTATUSYes -5.333e+02 1.180e+02 -4.521 6.23e-06 ***
## JOBClerical 1.389e+01 1.908e+02 0.073 0.941986
## JOBDoctor -5.647e+02 2.861e+02 -1.974 0.048433 *
## JOBHome Maker -1.995e+02 2.438e+02 -0.818 0.413208
## JOBLawyer -1.830e+02 2.167e+02 -0.844 0.398476
## JOBManager -9.853e+02 2.118e+02 -4.653 3.32e-06 ***
## JOBProfessional -1.459e+02 1.967e+02 -0.742 0.458352
## JOBStudent -1.712e+02 2.216e+02 -0.773 0.439748
## CAR_USEPrivate -7.361e+02 1.511e+02 -4.871 1.13e-06 ***
## CAR_TYPEPanel Truck 5.732e+02 2.388e+02 2.400 0.016396 *
## CAR_TYPEPickup 3.797e+02 1.680e+02 2.260 0.023828 *
## CAR_TYPESports Car 7.770e+02 1.831e+02 4.243 2.23e-05 ***
## CAR_TYPESUV 5.240e+02 1.389e+02 3.771 0.000164 ***
## CAR_TYPEVan 6.602e+02 2.038e+02 3.239 0.001202 **
## REVOKEDYes 4.733e+02 1.550e+02 3.054 0.002266 **
## URBANICITYHighly Urban/ Urban 1.646e+03 1.392e+02 11.822 < 2e-16 ***
## INCOME -5.164e-03 1.805e-03 -2.860 0.004242 **
## TRAVTIME 1.292e+01 3.341e+00 3.866 0.000111 ***
## TIF -5.077e+01 1.295e+01 -3.921 8.89e-05 ***
## CLM_FREQ 1.235e+02 4.891e+01 2.524 0.011616 *
## MVR_PTS 1.696e+02 2.775e+01 6.115 1.01e-09 ***
## CAR_AGE -2.778e+01 1.131e+01 -2.455 0.014105 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4551 on 8137 degrees of freedom
## Multiple R-squared: 0.06681, Adjusted R-squared: 0.06418
## F-statistic: 25.33 on 23 and 8137 DF, p-value: < 2.2e-16
vif(select.model.AMT)
## GVIF Df GVIF^(1/(2*Df))
## PARENT1 1.329311 1 1.152957
## MSTATUS 1.316334 1 1.147316
## JOB 4.219698 7 1.108314
## CAR_USE 2.100340 1 1.449255
## CAR_TYPE 1.803986 5 1.060775
## REVOKED 1.017562 1 1.008743
## URBANICITY 1.242415 1 1.114637
## INCOME 2.149521 1 1.466124
## TRAVTIME 1.032640 1 1.016189
## TIF 1.005153 1 1.002573
## CLM_FREQ 1.265076 1 1.124756
## MVR_PTS 1.212759 1 1.101253
## CAR_AGE 1.524066 1 1.234531
par(mfrow=c(2,2))
plot(select.model.AMT)
par(mfrow=c(1,1))
Although the three models do not differ greatly in performance, the stepwise model performs slightly better: it has the lowest AIC, the smallest gap between multiple and adjusted R-squared, and acceptable accuracy, recall, precision, and ROC AUC, as summarized in the sketch below.
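The comparison below collects the headline metrics of the three logistic models in one table (a sketch using the objects fitted above; the specific metrics gathered here are our choice, not part of the original report):
# Side-by-side metrics for the three TARGET_FLAG models
model_compare <- data.frame(
  model    = c("full", "stepwise", "manual"),
  AIC      = c(AIC(full.model_FLAG), AIC(step.model_FLAG), AIC(select.model_FLAG)),
  AUC      = c(auc(roc(ntesting$TARGET_FLAG, m1_prob)),
               auc(roc(ntesting$TARGET_FLAG, m2_prob)),
               auc(roc(ntesting$TARGET_FLAG, m3_prob))),
  McFadden = c(m1_mcFadden["McFadden"], m2_mcFadden["McFadden"], m3_mcFadden["McFadden"])
)
model_compare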
Based on this comparison, the second (stepwise) model will be used to generate predictions for the evaluation dataset.
# Read the evaluation dataset
eval_df <- read.csv("https://raw.githubusercontent.com/L-Velasco/DATA621_FA18/master/HW4/insurance-evaluation-data.csv", stringsAsFactors = FALSE)
# Columns not used by the 2nd model do not need to be dropped; predict() ignores extra columns
#eval_df <- dplyr::select(eval_df, -YOJ, -MSTATUS, -RED_CAR)
# Convert to numeric
eval_df$INCOME <- as.numeric(gsub('[$,]', '', eval_df$INCOME))
eval_df$HOME_VAL <- as.numeric(gsub('[$,]', '', eval_df$HOME_VAL))
eval_df$BLUEBOOK <- as.numeric(gsub('[$,]', '', eval_df$BLUEBOOK))
eval_df$OLDCLAIM <- as.numeric(gsub('[$,]', '', eval_df$OLDCLAIM))
# Remove irrelevant characters
eval_df$MSTATUS <- gsub("z_", "", eval_df$MSTATUS)
eval_df$SEX <- gsub("z_", "", eval_df$SEX)
eval_df$EDUCATION <- gsub("z_", "", eval_df$EDUCATION)
eval_df$JOB <- gsub("z_", "", eval_df$JOB)
eval_df$CAR_USE <- gsub("z_", "", eval_df$CAR_USE)
eval_df$CAR_TYPE <- gsub("z_", "", eval_df$CAR_TYPE)
eval_df$URBANICITY <- gsub("z_", "", eval_df$URBANICITY)
#impute
eval_df = eval_df %>%
mutate(AGE =
ifelse(is.na(AGE),
mean(AGE, na.rm=TRUE), AGE)) %>%
mutate(YOJ =
ifelse(is.na(YOJ),
mean(YOJ, na.rm=TRUE), YOJ)) %>%
mutate(INCOME =
ifelse(is.na(INCOME),
median(INCOME, na.rm=TRUE), INCOME)) %>%
mutate(HOME_VAL =
ifelse(is.na(HOME_VAL),
mean(HOME_VAL, na.rm=TRUE), HOME_VAL)) %>%
mutate(CAR_AGE =
ifelse(is.na(CAR_AGE),
mean(CAR_AGE, na.rm=TRUE), CAR_AGE)) %>%
mutate(JOB =
ifelse((JOB == "" & EDUCATION == 'PhD'),
"Doctor", JOB)) %>%
mutate(JOB =
ifelse((JOB == "" & EDUCATION == 'Masters'),
"Lawyer", JOB))
eval_prob <- predict(step.model_FLAG, eval_df, type = "response")
eval_pclass <- ifelse(eval_prob >= 0.5, 1, 0)
eval_amt <- ifelse(eval_pclass == 1, predict(step.model.AMT, eval_df, type = "response"), 0)
eval_df$TARGET_FLAG <- eval_pclass
eval_df$TARGET_AMT <- eval_amt
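Before exporting, a quick sanity check of the predictions (a sketch, not part of the original script):
# Distribution of predicted flags and of the non-zero predicted amounts
table(eval_df$TARGET_FLAG)
summary(eval_df$TARGET_AMT[eval_df$TARGET_FLAG == 1])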
# Export
write.csv(eval_df,file="HW4_Auto Insurance.csv")