DATA EXPLORATION
# Read data
train_df <- read.csv("https://raw.githubusercontent.com/monuchacko/cuny_msds/master/data_621/Homework4/insurance_training_data.csv", stringsAsFactors = FALSE)
dim(train_df)
## [1] 8161 26
str(train_df)
## 'data.frame': 8161 obs. of 26 variables:
## $ INDEX : int 1 2 4 5 6 7 8 11 12 13 ...
## $ TARGET_FLAG: int 0 0 0 0 0 1 0 1 1 0 ...
## $ TARGET_AMT : num 0 0 0 0 0 ...
## $ KIDSDRIV : int 0 0 0 0 0 0 0 1 0 0 ...
## $ AGE : int 60 43 35 51 50 34 54 37 34 50 ...
## $ HOMEKIDS : int 0 0 1 0 0 1 0 2 0 0 ...
## $ YOJ : int 11 11 10 14 NA 12 NA NA 10 7 ...
## $ INCOME : chr "$67,349" "$91,449" "$16,039" "" ...
## $ PARENT1 : chr "No" "No" "No" "No" ...
## $ HOME_VAL : chr "$0" "$257,252" "$124,191" "$306,251" ...
## $ MSTATUS : chr "z_No" "z_No" "Yes" "Yes" ...
## $ SEX : chr "M" "M" "z_F" "M" ...
## $ EDUCATION : chr "PhD" "z_High School" "z_High School" "<High School" ...
## $ JOB : chr "Professional" "z_Blue Collar" "Clerical" "z_Blue Collar" ...
## $ TRAVTIME : int 14 22 5 32 36 46 33 44 34 48 ...
## $ CAR_USE : chr "Private" "Commercial" "Private" "Private" ...
## $ BLUEBOOK : chr "$14,230" "$14,940" "$4,010" "$15,440" ...
## $ TIF : int 11 1 4 7 1 1 1 1 1 7 ...
## $ CAR_TYPE : chr "Minivan" "Minivan" "z_SUV" "Minivan" ...
## $ RED_CAR : chr "yes" "yes" "no" "yes" ...
## $ OLDCLAIM : chr "$4,461" "$0" "$38,690" "$0" ...
## $ CLM_FREQ : int 2 0 2 0 2 0 0 1 0 0 ...
## $ REVOKED : chr "No" "No" "No" "No" ...
## $ MVR_PTS : int 3 0 3 0 3 0 0 10 0 1 ...
## $ CAR_AGE : int 18 1 10 6 17 7 1 7 1 17 ...
## $ URBANICITY : chr "Highly Urban/ Urban" "Highly Urban/ Urban" "Highly Urban/ Urban" "Highly Urban/ Urban" ...
head(train_df)
## INDEX TARGET_FLAG TARGET_AMT KIDSDRIV AGE HOMEKIDS YOJ INCOME PARENT1
## 1 1 0 0 0 60 0 11 $67,349 No
## 2 2 0 0 0 43 0 11 $91,449 No
## 3 4 0 0 0 35 1 10 $16,039 No
## 4 5 0 0 0 51 0 14 No
## 5 6 0 0 0 50 0 NA $114,986 No
## 6 7 1 2946 0 34 1 12 $125,301 Yes
## HOME_VAL MSTATUS SEX EDUCATION JOB TRAVTIME CAR_USE BLUEBOOK
## 1 $0 z_No M PhD Professional 14 Private $14,230
## 2 $257,252 z_No M z_High School z_Blue Collar 22 Commercial $14,940
## 3 $124,191 Yes z_F z_High School Clerical 5 Private $4,010
## 4 $306,251 Yes M <High School z_Blue Collar 32 Private $15,440
## 5 $243,925 Yes z_F PhD Doctor 36 Private $18,000
## 6 $0 z_No z_F Bachelors z_Blue Collar 46 Commercial $17,430
## TIF CAR_TYPE RED_CAR OLDCLAIM CLM_FREQ REVOKED MVR_PTS CAR_AGE
## 1 11 Minivan yes $4,461 2 No 3 18
## 2 1 Minivan yes $0 0 No 0 1
## 3 4 z_SUV no $38,690 2 No 3 10
## 4 7 Minivan yes $0 0 No 0 6
## 5 1 z_SUV no $19,217 2 Yes 3 17
## 6 1 Sports Car no $0 0 No 0 7
## URBANICITY
## 1 Highly Urban/ Urban
## 2 Highly Urban/ Urban
## 3 Highly Urban/ Urban
## 4 Highly Urban/ Urban
## 5 Highly Urban/ Urban
## 6 Highly Urban/ Urban
# Exclude the INDEX column
tr <- train_df[-1]
# Convert to numeric
tr$INCOME <- as.numeric(gsub('[$,]', '', tr$INCOME))
tr$HOME_VAL <- as.numeric(gsub('[$,]', '', tr$HOME_VAL))
tr$BLUEBOOK <- as.numeric(gsub('[$,]', '', tr$BLUEBOOK))
tr$OLDCLAIM <- as.numeric(gsub('[$,]', '', tr$OLDCLAIM))
# Remove characters that are not required
tr$MSTATUS <- gsub("z_", "", tr$MSTATUS)
tr$SEX <- gsub("z_", "", tr$SEX)
tr$EDUCATION <- gsub("z_", "", tr$EDUCATION)
tr$JOB <- gsub("z_", "", tr$JOB)
tr$CAR_USE <- gsub("z_", "", tr$CAR_USE)
tr$CAR_TYPE <- gsub("z_", "", tr$CAR_TYPE)
tr$URBANICITY <- gsub("z_", "", tr$URBANICITY)
# Reorder columns -- predictor categorical, predictor numeric, target
indx <- c(8, 10:13, 15, 18:19, 22, 25, 3:7, 9, 14, 16:17, 20:21, 23:24, 1:2)
tr_ordered <- tr
setcolorder(tr_ordered,indx)
Examine the data
table(tr$PARENT1)
##
## No Yes
## 7084 1077
table(tr$MSTATUS)
##
## No Yes
## 3267 4894
table(tr$SEX)
##
## F M
## 4375 3786
table(tr$EDUCATION)
##
## <High School Bachelors High School Masters PhD
## 1203 2242 2330 1658 728
table(tr$JOB)
##
## Blue Collar Clerical Doctor Home Maker Lawyer
## 526 1825 1271 246 641 835
## Manager Professional Student
## 988 1117 712
table(tr$CAR_USE)
##
## Commercial Private
## 3029 5132
table(tr$CAR_TYPE)
##
## Minivan Panel Truck Pickup Sports Car SUV Van
## 2145 676 1389 907 2294 750
table(tr$RED_CAR)
##
## no yes
## 5783 2378
table(tr$REVOKED)
##
## No Yes
## 7161 1000
table(tr$URBANICITY)
##
## Highly Rural/ Rural Highly Urban/ Urban
## 1669 6492
Summary of the data
summary(tr)
## PARENT1 MSTATUS SEX EDUCATION
## Length:8161 Length:8161 Length:8161 Length:8161
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## JOB CAR_USE CAR_TYPE RED_CAR
## Length:8161 Length:8161 Length:8161 Length:8161
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## REVOKED URBANICITY KIDSDRIV AGE
## Length:8161 Length:8161 Min. :0.0000 Min. :16.00
## Class :character Class :character 1st Qu.:0.0000 1st Qu.:39.00
## Mode :character Mode :character Median :0.0000 Median :45.00
## Mean :0.1711 Mean :44.79
## 3rd Qu.:0.0000 3rd Qu.:51.00
## Max. :4.0000 Max. :81.00
## NA's :6
## HOMEKIDS YOJ INCOME HOME_VAL
## Min. :0.0000 Min. : 0.0 Min. : 0 Min. : 0
## 1st Qu.:0.0000 1st Qu.: 9.0 1st Qu.: 28097 1st Qu.: 0
## Median :0.0000 Median :11.0 Median : 54028 Median :161160
## Mean :0.7212 Mean :10.5 Mean : 61898 Mean :154867
## 3rd Qu.:1.0000 3rd Qu.:13.0 3rd Qu.: 85986 3rd Qu.:238724
## Max. :5.0000 Max. :23.0 Max. :367030 Max. :885282
## NA's :454 NA's :445 NA's :464
## TRAVTIME BLUEBOOK TIF OLDCLAIM
## Min. : 5.00 Min. : 1500 Min. : 1.000 Min. : 0
## 1st Qu.: 22.00 1st Qu.: 9280 1st Qu.: 1.000 1st Qu.: 0
## Median : 33.00 Median :14440 Median : 4.000 Median : 0
## Mean : 33.49 Mean :15710 Mean : 5.351 Mean : 4037
## 3rd Qu.: 44.00 3rd Qu.:20850 3rd Qu.: 7.000 3rd Qu.: 4636
## Max. :142.00 Max. :69740 Max. :25.000 Max. :57037
##
## CLM_FREQ MVR_PTS CAR_AGE TARGET_FLAG
## Min. :0.0000 Min. : 0.000 Min. :-3.000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.: 0.000 1st Qu.: 1.000 1st Qu.:0.0000
## Median :0.0000 Median : 1.000 Median : 8.000 Median :0.0000
## Mean :0.7986 Mean : 1.696 Mean : 8.328 Mean :0.2638
## 3rd Qu.:2.0000 3rd Qu.: 3.000 3rd Qu.:12.000 3rd Qu.:1.0000
## Max. :5.0000 Max. :13.000 Max. :28.000 Max. :1.0000
## NA's :510
## TARGET_AMT
## Min. : 0
## 1st Qu.: 0
## Median : 0
## Mean : 1504
## 3rd Qu.: 1036
## Max. :107586
##
boxplot((INCOME/1000)~AGE,data=tr, main="Income vs Age", xlab="Age", ylab="Income")

DATA PREPARATION
tr_prep <- tr_ordered
M <- sapply(tr_prep, function(x) sum(x=="") | sum(is.na(x))); names(M[(M>0)])
## [1] "JOB" "AGE" "YOJ" "INCOME" "HOME_VAL" "CAR_AGE"
x <- c(12, 14, 15, 16, 23)
par(mfrow=c(2,3))
for (val in x) {
hist(tr_prep[,val],xlab=names(tr_prep[val]), main="")
}
par(mfrow=c(1,1))

#impute
tr_prep = tr_prep %>%
mutate(AGE =
ifelse(is.na(AGE),
mean(AGE, na.rm=TRUE), AGE)) %>%
mutate(YOJ =
ifelse(is.na(YOJ),
mean(YOJ, na.rm=TRUE), YOJ)) %>%
mutate(INCOME =
ifelse(is.na(INCOME),
median(INCOME, na.rm=TRUE), INCOME)) %>%
mutate(HOME_VAL =
ifelse(is.na(HOME_VAL),
mean(HOME_VAL, na.rm=TRUE), HOME_VAL)) %>%
mutate(CAR_AGE =
ifelse(is.na(CAR_AGE),
mean(CAR_AGE, na.rm=TRUE), CAR_AGE)) %>%
mutate(JOB =
ifelse((JOB == "" & EDUCATION == 'PhD'),
"Doctor", JOB)) %>%
mutate(JOB =
ifelse((JOB == "" & EDUCATION == 'Masters'),
"Lawyer", JOB))
M <- sapply(tr_prep, function(x) sum(x=="") | sum(is.na(x))); names(M[(M>0)])
## character(0)
# Outlier Capping
tr_prep2 <- tr_prep
id <- c(11:23)
for (val in id) {
qnt <- quantile(tr_prep2[,val], probs=c(.25, .75), na.rm = T)
caps <- quantile(tr_prep2[,val], probs=c(.05, .95), na.rm = T)
H <- 1.5 * IQR(tr_prep2[,val], na.rm = T)
tr_prep2[,val][tr_prep2[,val] < (qnt[1] - H)] <- caps[1]
tr_prep2[,val][tr_prep2[,val] > (qnt[2] + H)] <- caps[2]
}
BUILD MODELS
nTrain <- createDataPartition(tr_prep2$TARGET_FLAG, p=0.8, list=FALSE)
ntraining <- tr_prep2[nTrain,]
ntesting <- tr_prep2[-nTrain,]
set.seed(123)
# Logistic Regression build the model using training set
full.model_FLAG <- glm(TARGET_FLAG ~.-TARGET_AMT, data = ntraining , family = binomial)
summary(full.model_FLAG)
##
## Call:
## glm(formula = TARGET_FLAG ~ . - TARGET_AMT, family = binomial,
## data = ntraining)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.3161 -0.6996 -0.3866 0.5829 3.1880
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.347e+00 3.232e-01 -7.263 3.79e-13 ***
## PARENT1Yes 3.378e-01 1.246e-01 2.710 0.006729 **
## MSTATUSYes -5.268e-01 9.507e-02 -5.542 3.00e-08 ***
## SEXM -6.960e-02 1.255e-01 -0.555 0.579120
## EDUCATIONBachelors -4.536e-01 1.306e-01 -3.473 0.000514 ***
## EDUCATIONHigh School 1.049e-02 1.066e-01 0.098 0.921623
## EDUCATIONMasters -4.873e-01 2.105e-01 -2.315 0.020602 *
## EDUCATIONPhD -1.029e-01 2.560e-01 -0.402 0.687697
## JOBClerical 1.240e-01 1.193e-01 1.040 0.298474
## JOBDoctor -7.034e-01 2.788e-01 -2.523 0.011627 *
## JOBHome Maker 4.766e-02 1.740e-01 0.274 0.784083
## JOBLawyer -3.347e-02 2.006e-01 -0.167 0.867500
## JOBManager -9.087e-01 1.605e-01 -5.663 1.49e-08 ***
## JOBProfessional -4.674e-02 1.345e-01 -0.348 0.728197
## JOBStudent -1.301e-01 1.479e-01 -0.879 0.379223
## CAR_USEPrivate -8.176e-01 9.990e-02 -8.184 2.75e-16 ***
## CAR_TYPEPanel Truck 7.100e-01 1.790e-01 3.966 7.30e-05 ***
## CAR_TYPEPickup 5.226e-01 1.134e-01 4.610 4.03e-06 ***
## CAR_TYPESports Car 9.273e-01 1.465e-01 6.329 2.47e-10 ***
## CAR_TYPESUV 7.029e-01 1.242e-01 5.661 1.51e-08 ***
## CAR_TYPEVan 6.584e-01 1.433e-01 4.594 4.36e-06 ***
## RED_CARyes 5.826e-02 9.762e-02 0.597 0.550626
## REVOKEDYes 8.725e-01 1.046e-01 8.341 < 2e-16 ***
## URBANICITYHighly Urban/ Urban 2.413e+00 1.248e-01 19.335 < 2e-16 ***
## KIDSDRIV 6.506e-01 1.095e-01 5.943 2.80e-09 ***
## AGE -2.491e-03 4.636e-03 -0.537 0.591023
## HOMEKIDS 6.513e-02 4.490e-02 1.451 0.146862
## YOJ -9.775e-03 9.720e-03 -1.006 0.314574
## INCOME -3.687e-06 1.377e-06 -2.677 0.007430 **
## HOME_VAL -1.352e-06 3.871e-07 -3.492 0.000480 ***
## TRAVTIME 1.677e-02 2.202e-03 7.616 2.61e-14 ***
## BLUEBOOK -3.041e-05 6.153e-06 -4.942 7.73e-07 ***
## TIF -6.085e-02 8.712e-03 -6.985 2.85e-12 ***
## OLDCLAIM -1.275e-05 5.401e-06 -2.360 0.018260 *
## CLM_FREQ 2.044e-01 3.324e-02 6.148 7.84e-10 ***
## MVR_PTS 1.127e-01 1.651e-02 6.826 8.72e-12 ***
## CAR_AGE -1.653e-03 8.570e-03 -0.193 0.847054
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 7502.7 on 6528 degrees of freedom
## Residual deviance: 5730.3 on 6492 degrees of freedom
## AIC: 5804.3
##
## Number of Fisher Scoring iterations: 5
round(exp(cbind(Estimate=coef(full.model_FLAG))),2)
## Estimate
## (Intercept) 0.10
## PARENT1Yes 1.40
## MSTATUSYes 0.59
## SEXM 0.93
## EDUCATIONBachelors 0.64
## EDUCATIONHigh School 1.01
## EDUCATIONMasters 0.61
## EDUCATIONPhD 0.90
## JOBClerical 1.13
## JOBDoctor 0.49
## JOBHome Maker 1.05
## JOBLawyer 0.97
## JOBManager 0.40
## JOBProfessional 0.95
## JOBStudent 0.88
## CAR_USEPrivate 0.44
## CAR_TYPEPanel Truck 2.03
## CAR_TYPEPickup 1.69
## CAR_TYPESports Car 2.53
## CAR_TYPESUV 2.02
## CAR_TYPEVan 1.93
## RED_CARyes 1.06
## REVOKEDYes 2.39
## URBANICITYHighly Urban/ Urban 11.16
## KIDSDRIV 1.92
## AGE 1.00
## HOMEKIDS 1.07
## YOJ 0.99
## INCOME 1.00
## HOME_VAL 1.00
## TRAVTIME 1.02
## BLUEBOOK 1.00
## TIF 0.94
## OLDCLAIM 1.00
## CLM_FREQ 1.23
## MVR_PTS 1.12
## CAR_AGE 1.00
# evaluate the model by predicting using the testing set
m1_prob <- predict(full.model_FLAG, ntesting, type = "response")
m1_pclass <- ifelse(m1_prob >= 0.5, 1, 0)
# create confusion matrix
pclass <- factor(m1_pclass,levels = c(1,0))
aclass <- factor(ntesting$TARGET_FLAG,levels = c(1,0))
confusionMatrix(pclass, aclass);
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 0
## 1 185 111
## 0 261 1075
##
## Accuracy : 0.7721
## 95% CI : (0.7509, 0.7922)
## No Information Rate : 0.7267
## P-Value [Acc > NIR] : 1.625e-05
##
## Kappa : 0.3589
##
## Mcnemar's Test P-Value : 1.116e-14
##
## Sensitivity : 0.4148
## Specificity : 0.9064
## Pos Pred Value : 0.6250
## Neg Pred Value : 0.8046
## Prevalence : 0.2733
## Detection Rate : 0.1134
## Detection Prevalence : 0.1814
## Balanced Accuracy : 0.6606
##
## 'Positive' Class : 1
##
# plot and show area under the curve
plot(roc(ntesting$TARGET_FLAG, m1_prob),print.auc=TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

# get McFadden
m1_mcFadden <- pR2(full.model_FLAG); m1_mcFadden["McFadden"]
## fitting null model for pseudo-r2
## McFadden
## 0.2362322
full.model.AMT <- lm(TARGET_AMT ~. -TARGET_FLAG, data = tr_prep2)
summary(full.model.AMT)
##
## Call:
## lm(formula = TARGET_AMT ~ . - TARGET_FLAG, data = tr_prep2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5169 -1704 -754 344 103686
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.821e+01 4.779e+02 0.205 0.837204
## PARENT1Yes 5.364e+02 2.043e+02 2.626 0.008654 **
## MSTATUSYes -5.759e+02 1.453e+02 -3.963 7.47e-05 ***
## SEXM 3.471e+02 1.833e+02 1.894 0.058294 .
## EDUCATIONBachelors -2.659e+02 2.055e+02 -1.294 0.195735
## EDUCATIONHigh School -9.582e+01 1.716e+02 -0.558 0.576571
## EDUCATIONMasters -4.219e+01 3.061e+02 -0.138 0.890376
## EDUCATIONPhD 3.898e+02 3.757e+02 1.038 0.299462
## JOBClerical 1.773e+01 1.917e+02 0.093 0.926295
## JOBDoctor -1.046e+03 4.009e+02 -2.608 0.009117 **
## JOBHome Maker -1.745e+02 2.684e+02 -0.650 0.515478
## JOBLawyer -2.484e+02 2.928e+02 -0.848 0.396242
## JOBManager -9.855e+02 2.328e+02 -4.233 2.33e-05 ***
## JOBProfessional -2.882e+01 2.117e+02 -0.136 0.891724
## JOBStudent -2.275e+02 2.358e+02 -0.965 0.334779
## CAR_USEPrivate -8.000e+02 1.578e+02 -5.071 4.05e-07 ***
## CAR_TYPEPanel Truck 2.668e+02 2.724e+02 0.979 0.327500
## CAR_TYPEPickup 3.707e+02 1.706e+02 2.172 0.029869 *
## CAR_TYPESports Car 1.014e+03 2.178e+02 4.656 3.28e-06 ***
## CAR_TYPESUV 7.434e+02 1.793e+02 4.145 3.43e-05 ***
## CAR_TYPEVan 5.220e+02 2.127e+02 2.454 0.014162 *
## RED_CARyes -3.741e+01 1.491e+02 -0.251 0.801877
## REVOKEDYes 5.906e+02 1.743e+02 3.387 0.000709 ***
## URBANICITYHighly Urban/ Urban 1.675e+03 1.392e+02 12.032 < 2e-16 ***
## KIDSDRIV 6.104e+02 1.789e+02 3.411 0.000650 ***
## AGE 6.643e+00 7.158e+00 0.928 0.353395
## HOMEKIDS 8.003e+01 6.981e+01 1.146 0.251677
## YOJ -5.048e+00 1.503e+01 -0.336 0.737007
## INCOME -4.652e-03 2.081e-03 -2.236 0.025408 *
## HOME_VAL -6.471e-04 5.961e-04 -1.085 0.277733
## TRAVTIME 1.285e+01 3.340e+00 3.847 0.000120 ***
## BLUEBOOK 1.357e-02 8.954e-03 1.515 0.129769
## TIF -5.120e+01 1.294e+01 -3.958 7.62e-05 ***
## OLDCLAIM -1.685e-02 9.119e-03 -1.848 0.064627 .
## CLM_FREQ 1.667e+02 5.686e+01 2.932 0.003381 **
## MVR_PTS 1.731e+02 2.790e+01 6.203 5.79e-10 ***
## CAR_AGE -2.708e+01 1.286e+01 -2.105 0.035304 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4544 on 8124 degrees of freedom
## Multiple R-squared: 0.0709, Adjusted R-squared: 0.06679
## F-statistic: 17.22 on 36 and 8124 DF, p-value: < 2.2e-16
vif(full.model.AMT)
## GVIF Df GVIF^(1/(2*Df))
## PARENT1 1.888802 1 1.374337
## MSTATUS 2.003847 1 1.415573
## SEX 3.302562 1 1.817295
## EDUCATION 15.755173 4 1.411490
## JOB 29.731384 7 1.274177
## CAR_USE 2.296181 1 1.515316
## CAR_TYPE 5.324276 5 1.182023
## RED_CAR 1.813330 1 1.346599
## REVOKED 1.291470 1 1.136429
## URBANICITY 1.245363 1 1.115958
## KIDSDRIV 1.338024 1 1.156730
## AGE 1.462748 1 1.209441
## HOMEKIDS 2.140216 1 1.462948
## YOJ 1.416953 1 1.190358
## INCOME 2.863846 1 1.692290
## HOME_VAL 2.143594 1 1.464102
## TRAVTIME 1.034456 1 1.017082
## BLUEBOOK 2.032225 1 1.425561
## TIF 1.006054 1 1.003022
## OLDCLAIM 1.827781 1 1.351954
## CLM_FREQ 1.714439 1 1.309366
## MVR_PTS 1.229565 1 1.108858
## CAR_AGE 1.975294 1 1.405451
par(mfrow=c(2,2))
plot(full.model.AMT)

par(mfrow=c(1,1))
3.2 Stepwise variable selection
# Logistic Regression build the model using training set
step.model_FLAG <- full.model_FLAG %>% stepAIC(trace = FALSE)
summary(step.model_FLAG)
##
## Call:
## glm(formula = TARGET_FLAG ~ PARENT1 + MSTATUS + EDUCATION + JOB +
## CAR_USE + CAR_TYPE + REVOKED + URBANICITY + KIDSDRIV + HOMEKIDS +
## INCOME + HOME_VAL + TRAVTIME + BLUEBOOK + TIF + OLDCLAIM +
## CLM_FREQ + MVR_PTS, family = binomial, data = ntraining)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.3178 -0.7000 -0.3857 0.5847 3.1757
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.586e+00 2.255e-01 -11.466 < 2e-16 ***
## PARENT1Yes 3.484e-01 1.240e-01 2.809 0.004966 **
## MSTATUSYes -5.377e-01 9.465e-02 -5.681 1.34e-08 ***
## EDUCATIONBachelors -4.577e-01 1.226e-01 -3.733 0.000190 ***
## EDUCATIONHigh School 9.879e-03 1.061e-01 0.093 0.925849
## EDUCATIONMasters -5.031e-01 1.917e-01 -2.625 0.008666 **
## EDUCATIONPhD -1.148e-01 2.417e-01 -0.475 0.634809
## JOBClerical 1.306e-01 1.190e-01 1.097 0.272672
## JOBDoctor -7.071e-01 2.784e-01 -2.540 0.011098 *
## JOBHome Maker 1.067e-01 1.633e-01 0.653 0.513440
## JOBLawyer -3.267e-02 2.005e-01 -0.163 0.870582
## JOBManager -9.098e-01 1.601e-01 -5.682 1.33e-08 ***
## JOBProfessional -4.383e-02 1.343e-01 -0.326 0.744181
## JOBStudent -8.281e-02 1.417e-01 -0.584 0.558986
## CAR_USEPrivate -8.225e-01 9.984e-02 -8.239 < 2e-16 ***
## CAR_TYPEPanel Truck 6.970e-01 1.673e-01 4.165 3.12e-05 ***
## CAR_TYPEPickup 5.225e-01 1.134e-01 4.609 4.05e-06 ***
## CAR_TYPESports Car 9.504e-01 1.227e-01 7.746 9.49e-15 ***
## CAR_TYPESUV 7.232e-01 9.644e-02 7.499 6.42e-14 ***
## CAR_TYPEVan 6.516e-01 1.382e-01 4.715 2.42e-06 ***
## REVOKEDYes 8.715e-01 1.046e-01 8.335 < 2e-16 ***
## URBANICITYHighly Urban/ Urban 2.411e+00 1.247e-01 19.341 < 2e-16 ***
## KIDSDRIV 6.400e-01 1.078e-01 5.936 2.92e-09 ***
## HOMEKIDS 6.882e-02 4.108e-02 1.675 0.093875 .
## INCOME -3.789e-06 1.371e-06 -2.764 0.005718 **
## HOME_VAL -1.380e-06 3.860e-07 -3.576 0.000349 ***
## TRAVTIME 1.677e-02 2.200e-03 7.619 2.55e-14 ***
## BLUEBOOK -3.023e-05 5.572e-06 -5.424 5.82e-08 ***
## TIF -6.095e-02 8.707e-03 -7.000 2.55e-12 ***
## OLDCLAIM -1.284e-05 5.394e-06 -2.381 0.017253 *
## CLM_FREQ 2.047e-01 3.321e-02 6.163 7.15e-10 ***
## MVR_PTS 1.136e-01 1.648e-02 6.895 5.38e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 7502.7 on 6528 degrees of freedom
## Residual deviance: 5732.4 on 6497 degrees of freedom
## AIC: 5796.4
##
## Number of Fisher Scoring iterations: 5
round(exp(cbind(Estimate=coef(step.model_FLAG))),2)
## Estimate
## (Intercept) 0.08
## PARENT1Yes 1.42
## MSTATUSYes 0.58
## EDUCATIONBachelors 0.63
## EDUCATIONHigh School 1.01
## EDUCATIONMasters 0.60
## EDUCATIONPhD 0.89
## JOBClerical 1.14
## JOBDoctor 0.49
## JOBHome Maker 1.11
## JOBLawyer 0.97
## JOBManager 0.40
## JOBProfessional 0.96
## JOBStudent 0.92
## CAR_USEPrivate 0.44
## CAR_TYPEPanel Truck 2.01
## CAR_TYPEPickup 1.69
## CAR_TYPESports Car 2.59
## CAR_TYPESUV 2.06
## CAR_TYPEVan 1.92
## REVOKEDYes 2.39
## URBANICITYHighly Urban/ Urban 11.15
## KIDSDRIV 1.90
## HOMEKIDS 1.07
## INCOME 1.00
## HOME_VAL 1.00
## TRAVTIME 1.02
## BLUEBOOK 1.00
## TIF 0.94
## OLDCLAIM 1.00
## CLM_FREQ 1.23
## MVR_PTS 1.12
# evaluate the model by predicting using the testing set
m2_prob <- predict(step.model_FLAG, ntesting, type = "response")
m2_pclass <- ifelse(m2_prob >= 0.5, 1, 0)
# create confusion matrix
pclass <- factor(m2_pclass,levels = c(1,0))
aclass <- factor(ntesting$TARGET_FLAG,levels = c(1,0))
confusionMatrix(pclass, aclass);
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 0
## 1 181 112
## 0 265 1074
##
## Accuracy : 0.769
## 95% CI : (0.7478, 0.7893)
## No Information Rate : 0.7267
## P-Value [Acc > NIR] : 5.512e-05
##
## Kappa : 0.3487
##
## Mcnemar's Test P-Value : 4.941e-15
##
## Sensitivity : 0.4058
## Specificity : 0.9056
## Pos Pred Value : 0.6177
## Neg Pred Value : 0.8021
## Prevalence : 0.2733
## Detection Rate : 0.1109
## Detection Prevalence : 0.1795
## Balanced Accuracy : 0.6557
##
## 'Positive' Class : 1
##
# plot and show area under the curve
plot(roc(ntesting$TARGET_FLAG, m2_prob),print.auc=TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

# get McFadden
m2_mcFadden <- pR2(step.model_FLAG); m2_mcFadden["McFadden"]
## fitting null model for pseudo-r2
## McFadden
## 0.2359591
# Linear Regression - TARGET_AMT
step.model.AMT <- full.model.AMT %>% stepAIC(trace = FALSE)
summary(step.model.AMT)
##
## Call:
## lm(formula = TARGET_AMT ~ PARENT1 + MSTATUS + SEX + JOB + CAR_USE +
## CAR_TYPE + REVOKED + URBANICITY + KIDSDRIV + INCOME + TRAVTIME +
## BLUEBOOK + TIF + OLDCLAIM + CLM_FREQ + MVR_PTS + CAR_AGE,
## data = tr_prep2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5204 -1696 -761 339 103637
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.990e+02 3.427e+02 0.581 0.561477
## PARENT1Yes 5.940e+02 1.783e+02 3.331 0.000869 ***
## MSTATUSYes -6.201e+02 1.196e+02 -5.184 2.23e-07 ***
## SEXM 3.182e+02 1.604e+02 1.984 0.047322 *
## JOBClerical 3.265e+01 1.907e+02 0.171 0.864042
## JOBDoctor -5.451e+02 2.860e+02 -1.906 0.056685 .
## JOBHome Maker -1.140e+02 2.465e+02 -0.462 0.643874
## JOBLawyer -1.610e+02 2.166e+02 -0.743 0.457219
## JOBManager -9.717e+02 2.116e+02 -4.592 4.46e-06 ***
## JOBProfessional -1.250e+02 1.967e+02 -0.635 0.525251
## JOBStudent -1.356e+02 2.218e+02 -0.611 0.541042
## CAR_USEPrivate -7.477e+02 1.510e+02 -4.951 7.51e-07 ***
## CAR_TYPEPanel Truck 3.094e+02 2.684e+02 1.153 0.249058
## CAR_TYPEPickup 3.965e+02 1.693e+02 2.342 0.019200 *
## CAR_TYPESports Car 1.029e+03 2.164e+02 4.755 2.02e-06 ***
## CAR_TYPESUV 7.487e+02 1.785e+02 4.194 2.77e-05 ***
## CAR_TYPEVan 5.400e+02 2.114e+02 2.555 0.010648 *
## REVOKEDYes 5.992e+02 1.742e+02 3.439 0.000587 ***
## URBANICITYHighly Urban/ Urban 1.667e+03 1.391e+02 11.985 < 2e-16 ***
## KIDSDRIV 7.014e+02 1.620e+02 4.330 1.51e-05 ***
## INCOME -5.499e-03 1.838e-03 -2.991 0.002785 **
## TRAVTIME 1.274e+01 3.338e+00 3.816 0.000137 ***
## BLUEBOOK 1.391e-02 8.854e-03 1.571 0.116175
## TIF -5.065e+01 1.293e+01 -3.917 9.05e-05 ***
## OLDCLAIM -1.693e-02 9.114e-03 -1.858 0.063258 .
## CLM_FREQ 1.688e+02 5.681e+01 2.972 0.002968 **
## MVR_PTS 1.739e+02 2.786e+01 6.241 4.56e-10 ***
## CAR_AGE -2.765e+01 1.130e+01 -2.447 0.014435 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4544 on 8133 degrees of freedom
## Multiple R-squared: 0.06987, Adjusted R-squared: 0.06678
## F-statistic: 22.63 on 27 and 8133 DF, p-value: < 2.2e-16
vif(step.model.AMT)
## GVIF Df GVIF^(1/(2*Df))
## PARENT1 1.439404 1 1.199751
## MSTATUS 1.357795 1 1.165245
## SEX 2.529123 1 1.590322
## JOB 4.392786 7 1.111501
## CAR_USE 2.103321 1 1.450283
## CAR_TYPE 5.046346 5 1.175703
## REVOKED 1.289951 1 1.135760
## URBANICITY 1.243742 1 1.115232
## KIDSDRIV 1.096511 1 1.047144
## INCOME 2.234825 1 1.494933
## TRAVTIME 1.033216 1 1.016472
## BLUEBOOK 1.986772 1 1.409529
## TIF 1.005226 1 1.002610
## OLDCLAIM 1.825934 1 1.351271
## CLM_FREQ 1.711751 1 1.308339
## MVR_PTS 1.226375 1 1.107418
## CAR_AGE 1.524544 1 1.234724
par(mfrow=c(2,2))
plot(step.model.AMT)

par(mfrow=c(1,1))
3.3 Significant value variable selection
# Logistic Regression build the model using training set
select.model_FLAG <- glm(TARGET_FLAG ~.
-TARGET_AMT
-EDUCATION
-SEX
-RED_CAR
-KIDSDRIV
-AGE
-HOMEKIDS
-YOJ
-HOME_VAL
-OLDCLAIM
-BLUEBOOK
, data = ntraining , family = binomial)
summary(select.model_FLAG)
##
## Call:
## glm(formula = TARGET_FLAG ~ . - TARGET_AMT - EDUCATION - SEX -
## RED_CAR - KIDSDRIV - AGE - HOMEKIDS - YOJ - HOME_VAL - OLDCLAIM -
## BLUEBOOK, family = binomial, data = ntraining)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.2415 -0.7167 -0.4082 0.6306 3.0465
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.965e+00 2.001e-01 -14.817 < 2e-16 ***
## PARENT1Yes 6.582e-01 1.013e-01 6.501 8.00e-11 ***
## MSTATUSYes -5.370e-01 7.570e-02 -7.093 1.31e-12 ***
## JOBClerical 1.608e-01 1.174e-01 1.369 0.1709
## JOBDoctor -5.845e-01 1.900e-01 -3.076 0.0021 **
## JOBHome Maker 6.168e-04 1.532e-01 0.004 0.9968
## JOBLawyer -3.043e-01 1.365e-01 -2.228 0.0259 *
## JOBManager -1.092e+00 1.446e-01 -7.553 4.26e-14 ***
## JOBProfessional -2.950e-01 1.232e-01 -2.395 0.0166 *
## JOBStudent 8.221e-02 1.365e-01 0.602 0.5470
## CAR_USEPrivate -7.324e-01 9.341e-02 -7.840 4.49e-15 ***
## CAR_TYPEPanel Truck 3.873e-01 1.504e-01 2.575 0.0100 *
## CAR_TYPEPickup 6.177e-01 1.099e-01 5.619 1.92e-08 ***
## CAR_TYPESports Car 1.022e+00 1.198e-01 8.536 < 2e-16 ***
## CAR_TYPESUV 8.141e-01 9.395e-02 8.666 < 2e-16 ***
## CAR_TYPEVan 5.360e-01 1.339e-01 4.003 6.26e-05 ***
## REVOKEDYes 7.713e-01 8.955e-02 8.613 < 2e-16 ***
## URBANICITYHighly Urban/ Urban 2.334e+00 1.232e-01 18.944 < 2e-16 ***
## INCOME -7.144e-06 1.231e-06 -5.804 6.48e-09 ***
## TRAVTIME 1.594e-02 2.167e-03 7.357 1.88e-13 ***
## TIF -5.875e-02 8.605e-03 -6.827 8.69e-12 ***
## CLM_FREQ 1.746e-01 2.861e-02 6.104 1.03e-09 ***
## MVR_PTS 1.117e-01 1.620e-02 6.894 5.42e-12 ***
## CAR_AGE -1.693e-02 7.369e-03 -2.297 0.0216 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 7502.7 on 6528 degrees of freedom
## Residual deviance: 5862.4 on 6505 degrees of freedom
## AIC: 5910.4
##
## Number of Fisher Scoring iterations: 5
round(exp(cbind(Estimate=coef(select.model_FLAG))),2)
## Estimate
## (Intercept) 0.05
## PARENT1Yes 1.93
## MSTATUSYes 0.58
## JOBClerical 1.17
## JOBDoctor 0.56
## JOBHome Maker 1.00
## JOBLawyer 0.74
## JOBManager 0.34
## JOBProfessional 0.74
## JOBStudent 1.09
## CAR_USEPrivate 0.48
## CAR_TYPEPanel Truck 1.47
## CAR_TYPEPickup 1.85
## CAR_TYPESports Car 2.78
## CAR_TYPESUV 2.26
## CAR_TYPEVan 1.71
## REVOKEDYes 2.16
## URBANICITYHighly Urban/ Urban 10.32
## INCOME 1.00
## TRAVTIME 1.02
## TIF 0.94
## CLM_FREQ 1.19
## MVR_PTS 1.12
## CAR_AGE 0.98
# evaluate the model by predicting using the testing set
m3_prob <- predict(select.model_FLAG, ntesting, type = "response")
m3_pclass <- ifelse(m3_prob >= 0.5, 1, 0)
# create confusion matrix
pclass <- factor(m3_pclass,levels = c(1,0))
aclass <- factor(ntesting$TARGET_FLAG,levels = c(1,0))
confusionMatrix(pclass, aclass);
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 0
## 1 171 111
## 0 275 1075
##
## Accuracy : 0.7635
## 95% CI : (0.7421, 0.7839)
## No Information Rate : 0.7267
## P-Value [Acc > NIR] : 0.000404
##
## Kappa : 0.3274
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.3834
## Specificity : 0.9064
## Pos Pred Value : 0.6064
## Neg Pred Value : 0.7963
## Prevalence : 0.2733
## Detection Rate : 0.1048
## Detection Prevalence : 0.1728
## Balanced Accuracy : 0.6449
##
## 'Positive' Class : 1
##
# plot and show area under the curve
plot(roc(ntesting$TARGET_FLAG, m3_prob),print.auc=TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

# get McFadden
m3_mcFadden <- pR2(select.model_FLAG); m3_mcFadden["McFadden"]
## fitting null model for pseudo-r2
## McFadden
## 0.2186236
# Linear Regression - TARGET_AMT
select.model.AMT <- lm(TARGET_AMT ~.
-TARGET_FLAG
-EDUCATION
-SEX
-RED_CAR
-KIDSDRIV
-AGE
-HOMEKIDS
-YOJ
-HOME_VAL
-OLDCLAIM
-BLUEBOOK
, data = tr_prep2)
summary(select.model.AMT)
##
## Call:
## lm(formula = TARGET_AMT ~ . - TARGET_FLAG - EDUCATION - SEX -
## RED_CAR - KIDSDRIV - AGE - HOMEKIDS - YOJ - HOME_VAL - OLDCLAIM -
## BLUEBOOK, data = tr_prep2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4919 -1694 -764 329 103709
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.323e+02 2.756e+02 2.294 0.021833 *
## PARENT1Yes 7.893e+02 1.716e+02 4.600 4.29e-06 ***
## MSTATUSYes -5.333e+02 1.180e+02 -4.521 6.23e-06 ***
## JOBClerical 1.389e+01 1.908e+02 0.073 0.941986
## JOBDoctor -5.647e+02 2.861e+02 -1.974 0.048433 *
## JOBHome Maker -1.995e+02 2.438e+02 -0.818 0.413208
## JOBLawyer -1.830e+02 2.167e+02 -0.844 0.398476
## JOBManager -9.853e+02 2.118e+02 -4.653 3.32e-06 ***
## JOBProfessional -1.459e+02 1.967e+02 -0.742 0.458352
## JOBStudent -1.712e+02 2.216e+02 -0.773 0.439748
## CAR_USEPrivate -7.361e+02 1.511e+02 -4.871 1.13e-06 ***
## CAR_TYPEPanel Truck 5.732e+02 2.388e+02 2.400 0.016396 *
## CAR_TYPEPickup 3.797e+02 1.680e+02 2.260 0.023828 *
## CAR_TYPESports Car 7.770e+02 1.831e+02 4.243 2.23e-05 ***
## CAR_TYPESUV 5.240e+02 1.389e+02 3.771 0.000164 ***
## CAR_TYPEVan 6.602e+02 2.038e+02 3.239 0.001202 **
## REVOKEDYes 4.733e+02 1.550e+02 3.054 0.002266 **
## URBANICITYHighly Urban/ Urban 1.646e+03 1.392e+02 11.822 < 2e-16 ***
## INCOME -5.164e-03 1.805e-03 -2.860 0.004242 **
## TRAVTIME 1.292e+01 3.341e+00 3.866 0.000111 ***
## TIF -5.077e+01 1.295e+01 -3.921 8.89e-05 ***
## CLM_FREQ 1.235e+02 4.891e+01 2.524 0.011616 *
## MVR_PTS 1.696e+02 2.775e+01 6.115 1.01e-09 ***
## CAR_AGE -2.778e+01 1.131e+01 -2.455 0.014105 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4551 on 8137 degrees of freedom
## Multiple R-squared: 0.06681, Adjusted R-squared: 0.06418
## F-statistic: 25.33 on 23 and 8137 DF, p-value: < 2.2e-16
vif(select.model.AMT)
## GVIF Df GVIF^(1/(2*Df))
## PARENT1 1.329311 1 1.152957
## MSTATUS 1.316334 1 1.147316
## JOB 4.219698 7 1.108314
## CAR_USE 2.100340 1 1.449255
## CAR_TYPE 1.803986 5 1.060775
## REVOKED 1.017562 1 1.008743
## URBANICITY 1.242415 1 1.114637
## INCOME 2.149521 1 1.466124
## TRAVTIME 1.032640 1 1.016189
## TIF 1.005153 1 1.002573
## CLM_FREQ 1.265076 1 1.124756
## MVR_PTS 1.212759 1 1.101253
## CAR_AGE 1.524066 1 1.234531
par(mfrow=c(2,2))
plot(select.model.AMT)

par(mfrow=c(1,1))
SELECT MODELS
# Read the evaluation dataset
eval_df <- read.csv("https://raw.githubusercontent.com/monuchacko/cuny_msds/master/data_621/Homework4/insurance-evaluation-data.csv", stringsAsFactors = FALSE)
# Remove columns not selected in 2nd model
#eval_df <- dplyr::select(eval_df, -YOJ, -MSTATUS, -RED_CAR)
# Convert to numeric
eval_df$INCOME <- as.numeric(gsub('[$,]', '', eval_df$INCOME))
eval_df$HOME_VAL <- as.numeric(gsub('[$,]', '', eval_df$HOME_VAL))
eval_df$BLUEBOOK <- as.numeric(gsub('[$,]', '', eval_df$BLUEBOOK))
eval_df$OLDCLAIM <- as.numeric(gsub('[$,]', '', eval_df$OLDCLAIM))
# Remove irrelevant characters
eval_df$MSTATUS <- gsub("z_", "", eval_df$MSTATUS)
eval_df$SEX <- gsub("z_", "", eval_df$SEX)
eval_df$EDUCATION <- gsub("z_", "", eval_df$EDUCATION)
eval_df$JOB <- gsub("z_", "", eval_df$JOB)
eval_df$CAR_USE <- gsub("z_", "", eval_df$CAR_USE)
eval_df$CAR_TYPE <- gsub("z_", "", eval_df$CAR_TYPE)
eval_df$URBANICITY <- gsub("z_", "", eval_df$URBANICITY)
#impute
eval_df = eval_df %>%
mutate(AGE =
ifelse(is.na(AGE),
mean(AGE, na.rm=TRUE), AGE)) %>%
mutate(YOJ =
ifelse(is.na(YOJ),
mean(YOJ, na.rm=TRUE), YOJ)) %>%
mutate(INCOME =
ifelse(is.na(INCOME),
median(INCOME, na.rm=TRUE), INCOME)) %>%
mutate(HOME_VAL =
ifelse(is.na(HOME_VAL),
mean(HOME_VAL, na.rm=TRUE), HOME_VAL)) %>%
mutate(CAR_AGE =
ifelse(is.na(CAR_AGE),
mean(CAR_AGE, na.rm=TRUE), CAR_AGE)) %>%
mutate(JOB =
ifelse((JOB == "" & EDUCATION == 'PhD'),
"Doctor", JOB)) %>%
mutate(JOB =
ifelse((JOB == "" & EDUCATION == 'Masters'),
"Lawyer", JOB))
eval_prob <- predict(step.model_FLAG, eval_df, type = "response")
eval_pclass <- ifelse(eval_prob >= 0.5, 1, 0)
eval_amt <- ifelse(eval_pclass == 1, predict(step.model.AMT, eval_df, type = "response"), 0)
eval_df$TARGET_FLAG <- eval_pclass
eval_df$TARGET_AMT <- eval_amt
head(eval_df)
## INDEX TARGET_FLAG TARGET_AMT KIDSDRIV AGE HOMEKIDS YOJ INCOME PARENT1
## 1 3 0 0 0 48 0 11.00000 52881 No
## 2 9 0 0 1 40 1 11.00000 50815 Yes
## 3 10 0 0 0 44 2 12.00000 43486 Yes
## 4 18 0 0 0 35 2 10.37909 21204 Yes
## 5 21 0 0 0 59 0 12.00000 87460 No
## 6 30 0 0 0 46 0 14.00000 51778 No
## HOME_VAL MSTATUS SEX EDUCATION JOB TRAVTIME CAR_USE BLUEBOOK
## 1 0 No M Bachelors Manager 26 Private 21970
## 2 0 No M High School Manager 21 Private 18930
## 3 0 No F High School Blue Collar 30 Commercial 5900
## 4 0 No M High School Clerical 74 Private 9230
## 5 0 No M High School Manager 45 Private 15420
## 6 207519 Yes M Bachelors Professional 7 Commercial 25660
## TIF CAR_TYPE RED_CAR OLDCLAIM CLM_FREQ REVOKED MVR_PTS CAR_AGE
## 1 1 Van yes 0 0 No 2 10
## 2 6 Minivan no 3295 1 No 2 1
## 3 10 SUV no 0 0 No 0 10
## 4 6 Pickup no 0 0 Yes 0 4
## 5 1 Minivan yes 44857 2 No 4 1
## 6 1 Panel Truck no 2119 1 No 2 12
## URBANICITY
## 1 Highly Urban/ Urban
## 2 Highly Urban/ Urban
## 3 Highly Rural/ Rural
## 4 Highly Rural/ Rural
## 5 Highly Urban/ Urban
## 6 Highly Urban/ Urban
# Export
# write.csv(eval_df,file="Insurance_Results.csv")