Here, we read the training, test, and evaluation datasets into data frames.
# Load the three insurance data sets from their GitHub-hosted CSV files.
# `[-1]` drops the first column of the train and test data frames
# (presumably a saved row index -- TODO confirm). The evaluation data frame
# is kept with all of its columns.
insurance_tf_train <- read.csv(
  file = "https://raw.githubusercontent.com/charlsjoseph/Data621/master/Data621-Assignment4/insurance_tf_train.csv"
)[-1]
insurance_tf_test <- read.csv(
  file = "https://raw.githubusercontent.com/charlsjoseph/Data621/master/Data621-Assignment4/insurance_tf_test.csv"
)[-1]
insurance_tf_eval <- read.csv(
  file = "https://raw.githubusercontent.com/charlsjoseph/Data621/master/Data621-Assignment4/insurance_tf_eval.csv"
)
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
## extra argument 'family' will be disregarded
##
## Call:
## lm(formula = TARGET_FLAG ~ ., data = insurance_tf_train, family = "binomial")
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.12644 -0.21596 -0.08813 0.15159 1.01921
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 0.0812839907 0.0446558018 1.820
## TARGET_AMT 0.0000453575 0.0000009634 47.080
## KIDSDRIV 0.0462461026 0.0094476006 4.895
## AGE -0.0007547913 0.0005779591 -1.306
## HOMEKIDS 0.0068042333 0.0054467917 1.249
## YOJ -0.0032000320 0.0012315347 -2.598
## INCOME -0.0000009293 0.0000022295 -0.417
## PARENT1.Yes 0.0581170787 0.0166629145 3.488
## HOME_VAL -0.0000079158 0.0000028951 -2.734
## MSTATUS.Yes -0.0493993599 0.0110893211 -4.455
## SEX.z_F -0.0371779803 0.0141497638 -2.627
## EDUCATION..High.School -0.0010830283 0.0142078991 -0.076
## EDUCATION.Bachelors -0.0538526754 0.0130241155 -4.135
## EDUCATION.Masters -0.0623324374 0.0217041361 -2.872
## EDUCATION.PhD -0.0533709567 0.0259896401 -2.054
## JOB.Clerical 0.0882474571 0.0279327618 3.159
## JOB.Doctor -0.0159208729 0.0336347345 -0.473
## JOB.Home.Maker 0.0938373143 0.0289466622 3.242
## JOB.Lawyer 0.0445237354 0.0244240624 1.823
## JOB.Manager -0.0209308513 0.0240431783 -0.871
## JOB.Professional 0.0538688862 0.0255375046 2.109
## JOB.Student 0.0945906802 0.0302417318 3.128
## JOB.z_Blue.Collar 0.0646676064 0.0266018409 2.431
## TRAVTIME 0.0016167830 0.0002681491 6.029
## CAR_USE.Private -0.0938252416 0.0135715785 -6.913
## BLUEBOOK 0.0000019805 0.0000049145 0.403
## TIF -0.0054604866 0.0010049305 -5.434
## CAR_TYPE.Minivan -0.0525500944 0.0142701310 -3.683
## CAR_TYPE.Panel.Truck -0.0608849891 0.0194539127 -3.130
## CAR_TYPE.Sports.Car 0.0783581726 0.0187525673 4.179
## CAR_TYPE.Van -0.0249681023 0.0176627423 -1.414
## CAR_TYPE.z_SUV 0.0511000411 0.0159927509 3.195
## RED_CAR.yes -0.0047078344 0.0123174777 -0.382
## OLDCLAIM 0.0000185758 0.0000069674 2.666
## CLM_FREQ 0.0122133660 0.0052602772 2.322
## REVOKED.Yes 0.0984225898 0.0129028106 7.628
## MVR_PTS 0.0126311528 0.0021756967 5.806
## CAR_AGE 0.0011059158 0.0010532027 1.050
## URBANICITY.Highly.Urban..Urban 0.2220865265 0.0116094981 19.130
## Pr(>|t|)
## (Intercept) 0.068769 .
## TARGET_AMT < 2e-16 ***
## KIDSDRIV 0.0000010067616183 ***
## AGE 0.191613
## HOMEKIDS 0.211630
## YOJ 0.009387 **
## INCOME 0.676824
## PARENT1.Yes 0.000490 ***
## HOME_VAL 0.006271 **
## MSTATUS.Yes 0.0000085428799672 ***
## SEX.z_F 0.008623 **
## EDUCATION..High.School 0.939241
## EDUCATION.Bachelors 0.0000359676201184 ***
## EDUCATION.Masters 0.004093 **
## EDUCATION.PhD 0.040060 *
## JOB.Clerical 0.001589 **
## JOB.Doctor 0.635982
## JOB.Home.Maker 0.001194 **
## JOB.Lawyer 0.068358 .
## JOB.Manager 0.384031
## JOB.Professional 0.034948 *
## JOB.Student 0.001769 **
## JOB.z_Blue.Collar 0.015086 *
## TRAVTIME 0.0000000017359271 ***
## CAR_USE.Private 0.0000000000051850 ***
## BLUEBOOK 0.686970
## TIF 0.0000000572060663 ***
## CAR_TYPE.Minivan 0.000233 ***
## CAR_TYPE.Panel.Truck 0.001758 **
## CAR_TYPE.Sports.Car 0.0000297250178964 ***
## CAR_TYPE.Van 0.157527
## CAR_TYPE.z_SUV 0.001404 **
## RED_CAR.yes 0.702320
## OLDCLAIM 0.007692 **
## CLM_FREQ 0.020274 *
## REVOKED.Yes 0.0000000000000273 ***
## MVR_PTS 0.0000000067180077 ***
## CAR_AGE 0.293734
## URBANICITY.Highly.Urban..Urban < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3354 on 6489 degrees of freedom
## Multiple R-squared: 0.4264, Adjusted R-squared: 0.423
## F-statistic: 126.9 on 38 and 6489 DF, p-value: < 2.2e-16
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
## extra argument 'family' will be disregarded
##
## Call:
## lm(formula = TARGET_FLAG ~ . - AGE - HOMEKIDS - INCOME - EDUCATION..High.School -
## JOB.Doctor - JOB.Manager - BLUEBOOK - CAR_TYPE.Van - RED_CAR.yes -
## CAR_AGE, data = insurance_tf_train, family = binomial())
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.12822 -0.21445 -0.08835 0.15157 1.00693
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 0.0217262561 0.0282775375 0.768
## TARGET_AMT 0.0000453687 0.0000009627 47.128
## KIDSDRIV 0.0511161814 0.0085350295 5.989
## YOJ -0.0032226283 0.0011760826 -2.740
## PARENT1.Yes 0.0752521394 0.0145793903 5.162
## HOME_VAL -0.0000080258 0.0000028890 -2.778
## MSTATUS.Yes -0.0443855441 0.0107599630 -4.125
## SEX.z_F -0.0300182265 0.0119678374 -2.508
## EDUCATION.Bachelors -0.0508797252 0.0116076319 -4.383
## EDUCATION.Masters -0.0489748325 0.0181565578 -2.697
## EDUCATION.PhD -0.0417654619 0.0203586994 -2.051
## JOB.Clerical 0.1119929020 0.0188530274 5.940
## JOB.Home.Maker 0.1157225544 0.0207786402 5.569
## JOB.Lawyer 0.0571881637 0.0186026981 3.074
## JOB.Professional 0.0707776843 0.0165020074 4.289
## JOB.Student 0.1208345797 0.0224512549 5.382
## JOB.z_Blue.Collar 0.0836486699 0.0184231338 4.540
## TRAVTIME 0.0016032248 0.0002680160 5.982
## CAR_USE.Private -0.0961381367 0.0126764641 -7.584
## TIF -0.0054169409 0.0010043376 -5.394
## CAR_TYPE.Minivan -0.0461499648 0.0125947439 -3.664
## CAR_TYPE.Panel.Truck -0.0490862442 0.0179924063 -2.728
## CAR_TYPE.Sports.Car 0.0825900438 0.0180236855 4.582
## CAR_TYPE.z_SUV 0.0561864755 0.0150392713 3.736
## OLDCLAIM 0.0000184180 0.0000069651 2.644
## CLM_FREQ 0.0122058919 0.0052564836 2.322
## REVOKED.Yes 0.0990743804 0.0128964607 7.682
## MVR_PTS 0.0128011926 0.0021723650 5.893
## URBANICITY.Highly.Urban..Urban 0.2222016808 0.0115929052 19.167
## Pr(>|t|)
## (Intercept) 0.442324
## TARGET_AMT < 2e-16 ***
## KIDSDRIV 0.0000000022242853 ***
## YOJ 0.006158 **
## PARENT1.Yes 0.0000002521920493 ***
## HOME_VAL 0.005485 **
## MSTATUS.Yes 0.0000375255065348 ***
## SEX.z_F 0.012158 *
## EDUCATION.Bachelors 0.0000118735731813 ***
## EDUCATION.Masters 0.007007 **
## EDUCATION.PhD 0.040260 *
## JOB.Clerical 0.0000000029918272 ***
## JOB.Home.Maker 0.0000000265995337 ***
## JOB.Lawyer 0.002120 **
## JOB.Professional 0.0000182050370781 ***
## JOB.Student 0.0000000762026298 ***
## JOB.z_Blue.Collar 0.0000057155953880 ***
## TRAVTIME 0.0000000023237937 ***
## CAR_USE.Private 0.0000000000000382 ***
## TIF 0.0000000715168646 ***
## CAR_TYPE.Minivan 0.000250 ***
## CAR_TYPE.Panel.Truck 0.006386 **
## CAR_TYPE.Sports.Car 0.0000046846711756 ***
## CAR_TYPE.z_SUV 0.000189 ***
## OLDCLAIM 0.008205 **
## CLM_FREQ 0.020260 *
## REVOKED.Yes 0.0000000000000179 ***
## MVR_PTS 0.0000000039884910 ***
## URBANICITY.Highly.Urban..Urban < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3354 on 6499 degrees of freedom
## Multiple R-squared: 0.4255, Adjusted R-squared: 0.4231
## F-statistic: 171.9 on 28 and 6499 DF, p-value: < 2.2e-16
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
## extra argument 'family' will be disregarded
##
## Call:
## lm(formula = TARGET_FLAG ~ . - AGE - HOMEKIDS - INCOME - EDUCATION..High.School -
## JOB.Doctor - JOB.Manager - BLUEBOOK - CAR_TYPE.Van - RED_CAR.yes -
## CAR_AGE - SEX.z_F - EDUCATION.PhD - CLM_FREQ - EDUCATION.Masters -
## JOB.Lawyer, data = insurance_tf_train, family = "binomial")
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.13576 -0.21550 -0.08907 0.15485 1.02217
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) -0.0098085789 0.0245835358 -0.399
## TARGET_AMT 0.0000455274 0.0000009635 47.252
## KIDSDRIV 0.0505607726 0.0085401865 5.920
## YOJ -0.0032484649 0.0011774987 -2.759
## PARENT1.Yes 0.0745114245 0.0145641244 5.116
## HOME_VAL -0.0000087945 0.0000028722 -3.062
## MSTATUS.Yes -0.0435529210 0.0107342682 -4.057
## EDUCATION.Bachelors -0.0384696657 0.0100258065 -3.837
## JOB.Clerical 0.1277049523 0.0139677473 9.143
## JOB.Home.Maker 0.1140151987 0.0185145394 6.158
## JOB.Professional 0.0731743088 0.0144558459 5.062
## JOB.Student 0.1395842141 0.0187143407 7.459
## JOB.z_Blue.Collar 0.1034511316 0.0138668271 7.460
## TRAVTIME 0.0016384605 0.0002681343 6.111
## CAR_USE.Private -0.0882894522 0.0124855116 -7.071
## TIF -0.0055166854 0.0010051066 -5.489
## CAR_TYPE.Minivan -0.0476669882 0.0125195897 -3.807
## CAR_TYPE.Panel.Truck -0.0479460220 0.0177645928 -2.699
## CAR_TYPE.Sports.Car 0.0625932451 0.0158753382 3.943
## CAR_TYPE.z_SUV 0.0361057099 0.0124225754 2.906
## OLDCLAIM 0.0000291669 0.0000053694 5.432
## REVOKED.Yes 0.1023253282 0.0128846281 7.942
## MVR_PTS 0.0138566593 0.0021407076 6.473
## URBANICITY.Highly.Urban..Urban 0.2230700429 0.0115136139 19.374
## Pr(>|t|)
## (Intercept) 0.689914
## TARGET_AMT < 2e-16 ***
## KIDSDRIV 3.38e-09 ***
## YOJ 0.005818 **
## PARENT1.Yes 3.21e-07 ***
## HOME_VAL 0.002208 **
## MSTATUS.Yes 5.02e-05 ***
## EDUCATION.Bachelors 0.000126 ***
## JOB.Clerical < 2e-16 ***
## JOB.Home.Maker 7.80e-10 ***
## JOB.Professional 4.26e-07 ***
## JOB.Student 9.88e-14 ***
## JOB.z_Blue.Collar 9.76e-14 ***
## TRAVTIME 1.05e-09 ***
## CAR_USE.Private 1.69e-12 ***
## TIF 4.20e-08 ***
## CAR_TYPE.Minivan 0.000142 ***
## CAR_TYPE.Panel.Truck 0.006974 **
## CAR_TYPE.Sports.Car 8.14e-05 ***
## CAR_TYPE.z_SUV 0.003668 **
## OLDCLAIM 5.77e-08 ***
## REVOKED.Yes 2.33e-15 ***
## MVR_PTS 1.03e-10 ***
## URBANICITY.Highly.Urban..Urban < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3359 on 6504 degrees of freedom
## Multiple R-squared: 0.4232, Adjusted R-squared: 0.4212
## F-statistic: 207.5 on 23 and 6504 DF, p-value: < 2.2e-16
# ROC curve for model1, scored on its own fitted values for the training data.
# NOTE(review): the model summaries printed earlier in this document show
# lm() calls whose `family` argument was disregarded; if model1 is one of
# them, these fitted values come from a linear model rather than a logistic
# regression -- confirm this is intended.
roc(
  TARGET_FLAG ~ model1$fitted.values,
  data = insurance_tf_train,
  plot = TRUE,
  main = "ROC CURVE",
  col = "blue",
  percent = TRUE,   # report the AUC and its interval as percentages
  ci = TRUE,        # compute a confidence interval (of the AUC by default)
  print.auc = TRUE  # print the AUC value on the plot
)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##
## Call:
## roc.formula(formula = TARGET_FLAG ~ model1$fitted.values, data = insurance_tf_train, plot = TRUE, main = "ROC CURVE", col = "blue", percent = TRUE, ci = TRUE, print.auc = TRUE)
##
## Data: model1$fitted.values in 4796 controls (TARGET_FLAG 0) < 1732 cases (TARGET_FLAG 1).
## Area under the curve: 95.28%
## 95% CI: 94.74%-95.82% (DeLong)
# ROC curve for model2, scored on its own fitted values for the training data.
# NOTE(review): the model summaries printed earlier in this document show
# lm() calls whose `family` argument was disregarded; if model2 is one of
# them, these fitted values come from a linear model rather than a logistic
# regression -- confirm this is intended.
roc(
  TARGET_FLAG ~ model2$fitted.values,
  data = insurance_tf_train,
  plot = TRUE,
  main = "ROC CURVE",
  col = "blue",
  percent = TRUE,   # report the AUC and its interval as percentages
  ci = TRUE,        # compute a confidence interval (of the AUC by default)
  print.auc = TRUE  # print the AUC value on the plot
)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##
## Call:
## roc.formula(formula = TARGET_FLAG ~ model2$fitted.values, data = insurance_tf_train, plot = TRUE, main = "ROC CURVE", col = "blue", percent = TRUE, ci = TRUE, print.auc = TRUE)
##
## Data: model2$fitted.values in 4796 controls (TARGET_FLAG 0) < 1732 cases (TARGET_FLAG 1).
## Area under the curve: 95.25%
## 95% CI: 94.72%-95.79% (DeLong)
# ROC curve for model3, scored on its own fitted values for the training data.
# NOTE(review): the model summaries printed earlier in this document show
# lm() calls whose `family` argument was disregarded; if model3 is one of
# them, these fitted values come from a linear model rather than a logistic
# regression -- confirm this is intended.
roc(
  TARGET_FLAG ~ model3$fitted.values,
  data = insurance_tf_train,
  plot = TRUE,
  main = "ROC CURVE",
  col = "blue",
  percent = TRUE,   # report the AUC and its interval as percentages
  ci = TRUE,        # compute a confidence interval (of the AUC by default)
  print.auc = TRUE  # print the AUC value on the plot
)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##
## Call:
## roc.formula(formula = TARGET_FLAG ~ model3$fitted.values, data = insurance_tf_train, plot = TRUE, main = "ROC CURVE", col = "blue", percent = TRUE, ci = TRUE, print.auc = TRUE)
##
## Data: model3$fitted.values in 4796 controls (TARGET_FLAG 0) < 1732 cases (TARGET_FLAG 1).
## Area under the curve: 95.23%
## 95% CI: 94.69%-95.77% (DeLong)
Because the areas under the curve for model 2 and model 3 are virtually identical (95.25% vs. 95.23%), I am going to select model2, since its AUC value is slightly higher.