Data Preparation

##   KIDSDRIV TARGET_AMT
## 1        0   5658.933
## 2        1   6219.881
## 3        2   5541.682
## 4        3   4915.414
## 5        4   4054.000

##    AGEBIN TARGET_AMT
## 1  (0,57]   5598.432
## 2 (57,82]   6825.734

##   HOMEKIDS TARGET_AMT
## 1        0   5685.224
## 2        1   5522.436
## 3        2   6085.350
## 4        3   5431.659
## 5        4   5609.528
## 6        5   5009.000
##   HK0 TARGET_AMT
## 1   0   5722.475
## 2   1   5685.224

##   No_Income TARGET_AMT
## 1         0   5840.581
## 2         1   5052.280

##   PARENT1 TARGET_AMT
## 1      No   5603.351
## 2     Yes   6050.365

##   LHV TARGET_AMT
## 1   0   5833.665
## 2   1   5674.895

##   MSTATUS TARGET_AMT
## 1     Yes   5425.837
## 2    z_No   5966.716

##   SEX TARGET_AMT
## 1   M   6146.700
## 2 z_F   5343.804

##       EDUCATION TARGET_AMT
## 1  <High School   5678.822
## 2     Bachelors   5882.672
## 3       Masters   5966.203
## 4           PhD   6623.198
## 5 z_High School   5340.430
##   HS TARGET_AMT
## 1  0   5913.112
## 2  1   5340.430

## [1] 0.01145817
##   TRAVBIN TARGET_AMT
## 1      20   8993.472
## 2      30   5160.282
## 3      40   5663.291
## 4      50   5527.179
## 5      60   6109.261
## 6      70   6642.549
## 7      80   4284.250
## [1] -0.04024829
## [1] 0.0263967
## [1] 0.005176834

##      CAR_USE TARGET_AMT
## 1 Commercial   6098.789
## 2    Private   5326.728

## [1] 0.02359552
## [1] 0.033044
##   BLUEBOOKBIN TARGET_AMT
## 1           0   5261.043
## 2         0.5   6089.899
## 3           1   6889.024
## 4         1.5   5808.918
## 5           2   5085.294
## 6         2.5   5675.657
## [1] 0.02429707
## [1] 0.02816773

##      CAR_TYPE TARGET_AMT
## 1     Minivan   5601.665
## 2 Panel Truck   7464.703
## 3      Pickup   5430.106
## 4  Sports Car   5412.733
## 5         Van   6908.553
## 6       z_SUV   5241.104
##   LOWRISKCAR TARGET_AMT
## 1          0   7169.753
## 2          1   5388.646

##   RED_CAR TARGET_AMT
## 1      no   5568.224
## 2     yes   6036.419

## [1] 0.005805864

##   REVOKED TARGET_AMT
## 1      No   5847.834
## 2     Yes   5139.949

## [1] 0.03981117

## [1] -0.01573614

##              URBANICITY TARGET_AMT
## 1   Highly Urban/ Urban   5711.058
## 2 z_Highly Rural/ Rural   5544.839
## Warning in par(fig = c(0, 0.8, 0, 0.8), new = TRUE): calling par(new=TRUE)
## with no plot

## [1] 0.001960785

##      Variable Pearson PVal
## 1  TARGET_AMT    1.00 0.00
## 2    KIDSDRIV    0.02 0.65
## 3         AGE    0.07 0.10
## 4    HOMEKIDS   -0.01 0.90
## 5         YOJ    0.07 0.12
## 6      INCOME    0.02 0.67
## 7     PARENT1    0.05 0.26
## 8    HOME_VAL   -0.02 0.59
## 9     MSTATUS    0.13 0.00
## 10        SEX    0.05 0.25
## 11   TRAVTIME    0.03 0.46
## 12    CAR_USE   -0.03 0.49
## 13   BLUEBOOK   -0.03 0.51
## 14    RED_CAR   -0.07 0.12
## 15   OLDCLAIM   -0.03 0.41
## 16   CLM_FREQ   -0.01 0.87
## 17    REVOKED   -0.06 0.15
## 18 URBANICITY    0.04 0.34
## 19        HK0    0.00 0.95
## 20  No_Income   -0.11 0.01
## 21    TRAVBIN    0.06 0.16
## 22     PICKUP   -0.02 0.62
## 23  SR_CARAGE   -0.02 0.64

## Warning: Removed 12 rows containing missing values (position_stack).

## [1] 0.3840836        NA
## [1] 0.003522819          NA
## [1] 0.04073636         NA
## [1] 0.2208314        NA
## 
## Call:
## lm(formula = TARGET_AMT ~ BLUEBOOKBIN, data = traini[traini$TARGET_FLAG == 
##     1, ])
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -6730  -3130  -1584    207 101496 
## 
## Coefficients:
##                Estimate Std. Error t value             Pr(>|t|)    
## (Intercept)      5261.0      328.4  16.020 < 0.0000000000000002 ***
## BLUEBOOKBIN0.5    828.9      513.7   1.614              0.10676    
## BLUEBOOKBIN1     1628.0      556.5   2.925              0.00348 ** 
## BLUEBOOKBIN1.5    547.9      682.4   0.803              0.42212    
## BLUEBOOKBIN2     -175.7      497.1  -0.354              0.72372    
## BLUEBOOKBIN2.5    414.6      540.6   0.767              0.44320    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7730 on 2147 degrees of freedom
## Multiple R-squared:  0.005795,   Adjusted R-squared:  0.00348 
## F-statistic: 2.503 on 5 and 2147 DF,  p-value: 0.02869
## 
## Call:
## glm(formula = TARGET_FLAG ~ EDUCATION, data = traini)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.3403  -0.3200  -0.1972   0.6597   0.8283  
## 
## Coefficients:
##                        Estimate Std. Error t value             Pr(>|t|)
## (Intercept)             0.32003    0.01258  25.447 < 0.0000000000000002
## EDUCATIONBachelors     -0.08676    0.01559  -5.565    0.000000027028342
## EDUCATIONMasters       -0.12281    0.01652  -7.433    0.000000000000116
## EDUCATIONPhD           -0.14833    0.02048  -7.242    0.000000000000484
## EDUCATIONz_High School  0.02031    0.01549   1.311                 0.19
##                           
## (Intercept)            ***
## EDUCATIONBachelors     ***
## EDUCATIONMasters       ***
## EDUCATIONPhD           ***
## EDUCATIONz_High School    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.1902816)
## 
##     Null deviance: 1585.0  on 8160  degrees of freedom
## Residual deviance: 1551.9  on 8156  degrees of freedom
## AIC: 9625.8
## 
## Number of Fisher Scoring iterations: 2
## 
## Call:
## glm(formula = TARGET_FLAG ~ CAR_TYPE, data = traini)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.3352  -0.2955  -0.2633   0.6648   0.8373  
## 
## Coefficients:
##                     Estimate Std. Error t value             Pr(>|t|)    
## (Intercept)         0.162704   0.009419  17.274 < 0.0000000000000002 ***
## CAR_TYPEPanel Truck 0.100610   0.019241   5.229         0.0000001748 ***
## CAR_TYPEPickup      0.156231   0.015024  10.399 < 0.0000000000000002 ***
## CAR_TYPESports Car  0.172467   0.017278   9.982 < 0.0000000000000002 ***
## CAR_TYPEVan         0.105296   0.018506   5.690         0.0000000131 ***
## CAR_TYPEz_SUV       0.132850   0.013102  10.139 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.190302)
## 
##     Null deviance: 1585.0  on 8160  degrees of freedom
## Residual deviance: 1551.9  on 8155  degrees of freedom
## AIC: 9627.6
## 
## Number of Fisher Scoring iterations: 2
## 
## Call:
## glm(formula = TARGET_FLAG ~ AGEBIN, data = traini)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.3408  -0.2584  -0.2584   0.6592   0.7416  
## 
## Coefficients:
##               Estimate Std. Error t value             Pr(>|t|)    
## (Intercept)   0.258424   0.005041  51.260 < 0.0000000000000002 ***
## AGEBIN(57,82] 0.082400   0.019709   4.181            0.0000293 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.1938493)
## 
##     Null deviance: 1585.0  on 8160  degrees of freedom
## Residual deviance: 1581.6  on 8159  degrees of freedom
## AIC: 9774.4
## 
## Number of Fisher Scoring iterations: 2
## 
## Call:
## glm(formula = TARGET_FLAG ~ as.factor(BLUEBOOKBIN), data = traini)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.3282  -0.2605  -0.2327   0.6718   0.8035  
## 
## Coefficients:
##                            Estimate Std. Error t value
## (Intercept)                0.260461   0.009518  27.364
## as.factor(BLUEBOOKBIN)0.5 -0.027775   0.014411  -1.927
## as.factor(BLUEBOOKBIN)1   -0.011512   0.015896  -0.724
## as.factor(BLUEBOOKBIN)1.5 -0.063990   0.017813  -3.592
## as.factor(BLUEBOOKBIN)2    0.067772   0.015429   4.393
## as.factor(BLUEBOOKBIN)2.5  0.050480   0.016599   3.041
##                                       Pr(>|t|)    
## (Intercept)               < 0.0000000000000002 ***
## as.factor(BLUEBOOKBIN)0.5              0.05397 .  
## as.factor(BLUEBOOKBIN)1                0.46895    
## as.factor(BLUEBOOKBIN)1.5              0.00033 ***
## as.factor(BLUEBOOKBIN)2              0.0000113 ***
## as.factor(BLUEBOOKBIN)2.5              0.00236 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.1927076)
## 
##     Null deviance: 1585.0  on 8160  degrees of freedom
## Residual deviance: 1571.5  on 8155  degrees of freedom
## AIC: 9730.2
## 
## Number of Fisher Scoring iterations: 2
##  [1] "INDEX"       "TARGET_FLAG" "TARGET_AMT"  "KIDSDRIV"    "AGE"        
##  [6] "HOMEKIDS"    "YOJ"         "INCOME"      "PARENT1"     "HOME_VAL"   
## [11] "MSTATUS"     "SEX"         "EDUCATION"   "JOB"         "TRAVTIME"   
## [16] "CAR_USE"     "BLUEBOOK"    "TIF"         "CAR_TYPE"    "RED_CAR"    
## [21] "OLDCLAIM"    "CLM_FREQ"    "REVOKED"     "MVR_PTS"     "CAR_AGE"    
## [26] "URBANICITY"  "AGEBIN"      "HK0"         "No_Income"   "LHV"        
## [31] "HS"          "TRAVBIN"     "BLUEBOOKBIN" "PICKUP"      "SC"         
## [36] "minivan"     "SUV"         "LOWRISKCAR"  "LOGOLDCLAIM" "SR_CARAGE"  
## [41] "CLAIM"       "FAMILY"

Correlation Plots

# Correlation heat maps of the candidate predictors against each response.

# AGEBIN and TRAVBIN are overwritten in place with their as.numeric() values,
# presumably so they can enter cor() below (positions 27 and 32).
# NOTE(review): if either column is a factor, as.numeric() yields the level
# codes (1, 2, ...) rather than the bin labels -- confirm that is intended.
traini$AGEBIN <- as.numeric(traini$AGEBIN)
traini$TRAVBIN <- as.numeric(traini$TRAVBIN)

# Column positions used by both correlation matrices. Going by the names()
# listing printed earlier in this document (assuming the column order has not
# changed since), these are: KIDSDRIV, YOJ, PARENT1, MSTATUS, SEX, CAR_USE,
# BLUEBOOK, TIF, RED_CAR, CLM_FREQ, REVOKED, MVR_PTS, URBANICITY, AGEBIN, HK0,
# No_Income, LHV, TRAVBIN, LOWRISKCAR, LOGOLDCLAIM, SR_CARAGE.
shared_cols <- c(
  4, 7, 9, 11, 12, 16, 17, 18, 20, 22, 23, 24,
  26, 27, 28, 29, 30, 32, 38, 39, 40
)

# Column 3 plus the shared columns, using only rows where TARGET_FLAG == 1.
trainsub <- traini[traini$TARGET_FLAG == 1, ]
cormat_amt <- cor(trainsub[c(3, shared_cols)])
corrplot(cormat_amt, method = "color", tl.cex = .7)

# Column 2 plus the shared columns, using every row of traini.
cormat_flag <- cor(traini[c(2, shared_cols)])
corrplot(cormat_flag, method = "color", tl.cex = .7)

Model Building

## 
## Call:
## lm(formula = TARGET_AMT ~ MSTATUS + AGEBIN + No_Income + REVOKED + 
##     CAR_TYPE + BLUEBOOKBIN + as.numeric(TRAVBIN) + MVR_PTS + 
##     LOGOLDCLAIM + CAR_USE + HK0 + HS, data = traini[traini$TARGET_FLAG == 
##     1, ], weights = wts)
## 
## Weighted Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.7382 -0.8879 -0.4518  0.1214 29.2820 
## 
## Coefficients:
##                     Estimate Std. Error t value    Pr(>|t|)    
## (Intercept)          3587.22    1349.34   2.658     0.00791 ** 
## MSTATUS               279.22     281.08   0.993     0.32064    
## AGEBIN                941.22     811.81   1.159     0.24642    
## No_Income           -1510.65     283.34  -5.332 0.000000108 ***
## REVOKED              -464.24     270.00  -1.719     0.08569 .  
## CAR_TYPEPanel Truck  1948.11    1015.66   1.918     0.05524 .  
## CAR_TYPEPickup       1196.94     492.66   2.430     0.01520 *  
## CAR_TYPESports Car  -1064.92     477.33  -2.231     0.02578 *  
## CAR_TYPEVan          1052.48     863.62   1.219     0.22310    
## CAR_TYPEz_SUV        -388.70     425.78  -0.913     0.36139    
## BLUEBOOKBIN0.5       -120.41     506.16  -0.238     0.81199    
## BLUEBOOKBIN1          527.38     665.06   0.793     0.42787    
## BLUEBOOKBIN1.5      -1218.07     469.63  -2.594     0.00956 ** 
## BLUEBOOKBIN2         -631.57     346.46  -1.823     0.06845 .  
## BLUEBOOKBIN2.5        176.29     467.73   0.377     0.70628    
## as.numeric(TRAVBIN)    20.18      16.55   1.219     0.22299    
## MVR_PTS               264.26     134.53   1.964     0.04963 *  
## LOGOLDCLAIM           262.60     135.79   1.934     0.05326 .  
## CAR_USE               232.68     326.98   0.712     0.47678    
## HK0                  -193.09     287.97  -0.671     0.50259    
## HS                    132.14     276.61   0.478     0.63292    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.165 on 2132 degrees of freedom
## Multiple R-squared:  0.2122, Adjusted R-squared:  0.2048 
## F-statistic: 28.71 on 20 and 2132 DF,  p-value: < 0.00000000000000022
##                         GVIF Df GVIF^(1/(2*Df))
## MSTATUS             1.566579  1        1.251631
## AGEBIN              1.213132  1        1.101423
## No_Income           2.875327  1        1.695679
## REVOKED             2.410994  1        1.552738
## CAR_TYPE            7.650203  5        1.225652
## BLUEBOOKBIN         4.889756  5        1.172003
## as.numeric(TRAVBIN) 1.944586  1        1.394484
## MVR_PTS             1.772381  1        1.331308
## LOGOLDCLAIM         1.581929  1        1.257748
## CAR_USE             2.011945  1        1.418431
## HK0                 1.804389  1        1.343276
## HS                  2.346703  1        1.531895
## [1] -0.02484418
## 
## Call:
## lm(formula = TARGET_AMT ~ SEX + No_Income + REVOKED + LOWRISKCAR + 
##     BLUEBOOKBIN + as.numeric(TRAVBIN) + MVR_PTS, data = traini[traini$TARGET_FLAG == 
##     1, ], weights = wts2)
## 
## Weighted Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.3508 -0.8843 -0.4282  0.1325 30.3721 
## 
## Coefficients:
##                     Estimate Std. Error t value          Pr(>|t|)    
## (Intercept)          8087.41    1097.65   7.368 0.000000000000246 ***
## SEX                 -1417.76     294.90  -4.808 0.000001633217490 ***
## No_Income           -1753.32     264.41  -6.631 0.000000000042029 ***
## REVOKED              -537.23     256.37  -2.096           0.03624 *  
## LOWRISKCAR           -503.81     649.33  -0.776           0.43790    
## BLUEBOOKBIN0.5       -140.94     497.14  -0.283           0.77682    
## BLUEBOOKBIN1          674.23     660.94   1.020           0.30779    
## BLUEBOOKBIN1.5      -1049.77     436.64  -2.404           0.01629 *  
## BLUEBOOKBIN2         -675.38     323.57  -2.087           0.03698 *  
## BLUEBOOKBIN2.5        220.48     462.73   0.476           0.63378    
## as.numeric(TRAVBIN)    24.92      14.26   1.748           0.08065 .  
## MVR_PTS               397.06     124.83   3.181           0.00149 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.168 on 2141 degrees of freedom
## Multiple R-squared:  0.2068, Adjusted R-squared:  0.2027 
## F-statistic: 50.74 on 11 and 2141 DF,  p-value: < 0.00000000000000022
##                         GVIF Df GVIF^(1/(2*Df))
## SEX                 1.580500  1        1.257179
## No_Income           2.497251  1        1.580269
## REVOKED             2.167957  1        1.472398
## LOWRISKCAR          1.309848  1        1.144486
## BLUEBOOKBIN         2.782307  5        1.107747
## as.numeric(TRAVBIN) 1.438548  1        1.199395
## MVR_PTS             1.521918  1        1.233660
## [1] -0.01506344
BINARY OUTCOME
## Loading required package: lattice
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
## 
## Call:
## glm(formula = TARGET_FLAG ~ KIDSDRIV + PARENT1 + MSTATUS + JOB + 
##     as.numeric(TRAVBIN) + CAR_USE + CAR_TYPE + REVOKED + AGEBIN + 
##     BLUEBOOKBIN + HK0 + LHV + SR_CARAGE + HS + CLAIM + TIF + 
##     AGEBIN, data = traini)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.8589  -0.2832  -0.1311   0.3264   1.1382  
## 
## Coefficients:
##                       Estimate Std. Error t value             Pr(>|t|)    
## (Intercept)         -0.1491357  0.0512765  -2.908             0.003642 ** 
## KIDSDRIV             0.0553152  0.0097676   5.663    0.000000015369093 ***
## PARENT1              0.0603262  0.0191043   3.158             0.001596 ** 
## MSTATUS              0.0782673  0.0115750   6.762    0.000000000014574 ***
## JOBClerical          0.0409642  0.0247408   1.656             0.097814 .  
## JOBDoctor           -0.0365510  0.0332723  -1.099             0.272001    
## JOBHome Maker        0.0540337  0.0269235   2.007             0.044790 *  
## JOBLawyer            0.0258082  0.0252423   1.022             0.306614    
## JOBManager          -0.0550561  0.0234567  -2.347             0.018942 *  
## JOBProfessional      0.0098014  0.0231974   0.423             0.672655    
## JOBStudent           0.0615722  0.0262337   2.347             0.018946 *  
## JOBz_Blue Collar     0.0514954  0.0229218   2.247             0.024694 *  
## as.numeric(TRAVBIN) -0.0025748  0.0006354  -4.052    0.000051195952883 ***
## CAR_USE             -0.0950533  0.0140000  -6.790    0.000000000012042 ***
## CAR_TYPEPanel Truck  0.0437222  0.0227315   1.923             0.054462 .  
## CAR_TYPEPickup       0.0758939  0.0153132   4.956    0.000000733729598 ***
## CAR_TYPESports Car   0.1142615  0.0164744   6.936    0.000000000004352 ***
## CAR_TYPEVan          0.0646369  0.0187486   3.448             0.000569 ***
## CAR_TYPEz_SUV        0.0853386  0.0124826   6.837    0.000000000008697 ***
## REVOKED              0.1513386  0.0136205  11.111 < 0.0000000000000002 ***
## AGEBIN               0.1481709  0.0206609   7.172    0.000000000000807 ***
## BLUEBOOKBIN0.5      -0.0082901  0.0139869  -0.593             0.553397    
## BLUEBOOKBIN1        -0.0015668  0.0153939  -0.102             0.918935    
## BLUEBOOKBIN1.5      -0.0480574  0.0178381  -2.694             0.007073 ** 
## BLUEBOOKBIN2         0.0386424  0.0146548   2.637             0.008384 ** 
## BLUEBOOKBIN2.5       0.0018004  0.0155184   0.116             0.907644    
## HK0                 -0.0154104  0.0140616  -1.096             0.273145    
## LHV                  0.0438774  0.0110912   3.956    0.000076844042579 ***
## SR_CARAGE           -0.0092806  0.0053953  -1.720             0.085449 .  
## HS                   0.0324414  0.0113404   2.861             0.004238 ** 
## CLAIM                0.0662625  0.0030683  21.595 < 0.0000000000000002 ***
## TIF                 -0.0308475  0.0044481  -6.935    0.000000000004374 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.1604579)
## 
##     Null deviance: 1585.0  on 8160  degrees of freedom
## Residual deviance: 1304.4  on 8129  degrees of freedom
## AIC: 8261.5
## 
## Number of Fisher Scoring iterations: 2

## 
## Call:
## roc.default(response = traini$TARGET_FLAG, predictor = gmod1a$fitted.values,     plot = TRUE)
## 
## Data: gmod1a$fitted.values in 6008 controls (traini$TARGET_FLAG 0) < 2153 cases (traini$TARGET_FLAG 1).
## Area under the curve: 0.7694
##                         GVIF Df GVIF^(1/(2*Df))
## KIDSDRIV            1.269559  1        1.126747
## PARENT1             2.126426  1        1.458227
## MSTATUS             1.635869  1        1.279011
## JOB                 3.529538  8        1.082013
## as.numeric(TRAVBIN) 1.702065  1        1.304632
## CAR_USE             2.326691  1        1.525350
## CAR_TYPE            2.758824  5        1.106808
## REVOKED             1.014503  1        1.007225
## AGEBIN              1.327672  1        1.152246
## BLUEBOOKBIN         1.712618  5        1.055276
## HK0                 2.293618  1        1.514470
## LHV                 1.235122  1        1.111360
## SR_CARAGE           1.480344  1        1.216694
## HS                  1.334302  1        1.155120
## CLAIM               1.042723  1        1.021138
## TIF                 1.006188  1        1.003089
## [1] 0.9479028
## [1] 0.2837901
## [1] 0.7869283
## [1] 0.6612554
## [1] 0.8599471
## 
## Call:
## glm(formula = TARGET_FLAG ~ KIDSDRIV + PARENT1 + MSTATUS + JOB + 
##     logTRAVTIME + CAR_USE + TIF + LOWRISKCAR + REVOKED + AGEBIN + 
##     BLUEBOOKBIN + HK0 + LHV + HS + CLAIM + YOJ + JOB + TIF, data = traini)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.8788  -0.2796  -0.1322   0.3343   1.1072  
## 
## Coefficients:
##                   Estimate Std. Error t value             Pr(>|t|)    
## (Intercept)      -0.149463   0.045280  -3.301             0.000968 ***
## KIDSDRIV          0.047439   0.009673   4.904    0.000000956085761 ***
## PARENT1           0.060813   0.019172   3.172             0.001519 ** 
## MSTATUS           0.076676   0.011652   6.580    0.000000000049832 ***
## JOBClerical       0.056479   0.024020   2.351             0.018730 *  
## JOBDoctor        -0.039403   0.033187  -1.187             0.235143    
## JOBHome Maker     0.057585   0.027430   2.099             0.035816 *  
## JOBLawyer         0.020322   0.025136   0.808             0.418831    
## JOBManager       -0.049071   0.023383  -2.099             0.035888 *  
## JOBProfessional   0.014559   0.022984   0.633             0.526448    
## JOBStudent        0.057749   0.026276   2.198             0.027994 *  
## JOBz_Blue Collar  0.060775   0.021815   2.786             0.005350 ** 
## logTRAVTIME       0.021523   0.004474   4.811    0.000001528628645 ***
## CAR_USE          -0.095507   0.012914  -7.395    0.000000000000155 ***
## TIF              -0.030040   0.004462  -6.732    0.000000000017839 ***
## LOWRISKCAR       -0.001358   0.014685  -0.092             0.926343    
## REVOKED           0.156366   0.013653  11.453 < 0.0000000000000002 ***
## AGEBIN            0.129974   0.018424   7.055    0.000000000001872 ***
## BLUEBOOKBIN0.5   -0.009783   0.013736  -0.712             0.476346    
## BLUEBOOKBIN1     -0.005492   0.015400  -0.357             0.721402    
## BLUEBOOKBIN1.5   -0.054918   0.017355  -3.164             0.001560 ** 
## BLUEBOOKBIN2      0.053751   0.014490   3.709             0.000209 ***
## BLUEBOOKBIN2.5    0.024493   0.015316   1.599             0.109823    
## HK0              -0.047653   0.013176  -3.617             0.000300 ***
## LHV               0.046542   0.011124   4.184    0.000028964549069 ***
## HS                0.037845   0.011030   3.431             0.000604 ***
## CLAIM             0.068156   0.003070  22.204 < 0.0000000000000002 ***
## YOJ              -0.003964   0.001294  -3.063             0.002199 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.161485)
## 
##     Null deviance: 1585.0  on 8160  degrees of freedom
## Residual deviance: 1313.4  on 8133  degrees of freedom
## AIC: 8309.6
## 
## Number of Fisher Scoring iterations: 2

## 
## Call:
## roc.default(response = traini$TARGET_FLAG, predictor = gmod2a$fitted.values,     plot = TRUE)
## 
## Data: gmod2a$fitted.values in 6008 controls (traini$TARGET_FLAG 0) < 2153 cases (traini$TARGET_FLAG 1).
## Area under the curve: 0.7646
##                 GVIF Df GVIF^(1/(2*Df))
## KIDSDRIV    1.237181  1        1.112286
## PARENT1     2.127820  1        1.458705
## MSTATUS     1.647250  1        1.283453
## JOB         3.434417  8        1.080167
## logTRAVTIME 1.011340  1        1.005654
## CAR_USE     1.967227  1        1.402579
## TIF         1.006105  1        1.003048
## LOWRISKCAR  1.571636  1        1.253649
## REVOKED     1.012798  1        1.006379
## AGEBIN      1.048995  1        1.024204
## BLUEBOOKBIN 1.345821  5        1.030146
## HK0         2.000937  1        1.414545
## LHV         1.234612  1        1.111131
## HS          1.254312  1        1.119961
## CLAIM       1.036903  1        1.018284
## YOJ         1.362909  1        1.167437
## [1] 0.9495672
## [1] 0.270785
## [1] 0.7841924
## [1] 0.6580135
## [1] 0.8589927
## 
## Call:
## glm(formula = TARGET_FLAG ~ FAMILY + MSTATUS + JOB + logTRAVTIME + 
##     LOWRISKCAR + TIF + REVOKED + AGEBIN + BLUEBOOKBIN + HS + 
##     CLAIM + YOJ + JOB + TIF + LHV, data = traini)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.8177  -0.2798  -0.1339   0.3414   1.0980  
## 
## Coefficients:
##                   Estimate Std. Error t value             Pr(>|t|)    
## (Intercept)      -0.242454   0.038338  -6.324     0.00000000026825 ***
## FAMILY           -0.049373   0.004507 -10.955 < 0.0000000000000002 ***
## MSTATUS           0.082565   0.009697   8.514 < 0.0000000000000002 ***
## JOBClerical       0.008993   0.023189   0.388              0.69818    
## JOBDoctor        -0.101678   0.032205  -3.157              0.00160 ** 
## JOBHome Maker     0.002529   0.026478   0.096              0.92390    
## JOBLawyer        -0.038761   0.023901  -1.622              0.10490    
## JOBManager       -0.091437   0.022732  -4.022     0.00005812644403 ***
## JOBProfessional  -0.027300   0.022336  -1.222              0.22166    
## JOBStudent        0.040474   0.026191   1.545              0.12230    
## JOBz_Blue Collar  0.067208   0.021864   3.074              0.00212 ** 
## logTRAVTIME       0.021676   0.004486   4.832     0.00000137904911 ***
## LOWRISKCAR       -0.039355   0.013792  -2.853              0.00434 ** 
## TIF              -0.030065   0.004476  -6.717     0.00000000001983 ***
## REVOKED           0.157706   0.013694  11.516 < 0.0000000000000002 ***
## AGEBIN            0.130824   0.018440   7.095     0.00000000000141 ***
## BLUEBOOKBIN0.5   -0.014879   0.013763  -1.081              0.27970    
## BLUEBOOKBIN1     -0.002223   0.015442  -0.144              0.88556    
## BLUEBOOKBIN1.5   -0.043498   0.017341  -2.508              0.01215 *  
## BLUEBOOKBIN2      0.058643   0.014513   4.041     0.00005376680693 ***
## BLUEBOOKBIN2.5    0.026799   0.015362   1.745              0.08110 .  
## HS                0.048659   0.010969   4.436     0.00000927695885 ***
## CLAIM             0.069569   0.003072  22.644 < 0.0000000000000002 ***
## YOJ              -0.004118   0.001294  -3.182              0.00147 ** 
## LHV               0.043610   0.011151   3.911     0.00009269791513 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.1625214)
## 
##     Null deviance: 1585.0  on 8160  degrees of freedom
## Residual deviance: 1322.3  on 8136  degrees of freedom
## AIC: 8358.8
## 
## Number of Fisher Scoring iterations: 2

## 
## Call:
## roc.default(response = traini$TARGET_FLAG, predictor = gmod3a$fitted.values,     plot = TRUE)
## 
## Data: gmod3a$fitted.values in 6008 controls (traini$TARGET_FLAG 0) < 2153 cases (traini$TARGET_FLAG 1).
## Area under the curve: 0.7609
##                 GVIF Df GVIF^(1/(2*Df))
## FAMILY      1.075084  1        1.036863
## MSTATUS     1.133553  1        1.064684
## JOB         2.201279  8        1.050551
## logTRAVTIME 1.010519  1        1.005246
## LOWRISKCAR  1.377429  1        1.173639
## TIF         1.005986  1        1.002988
## REVOKED     1.012507  1        1.006234
## AGEBIN      1.044109  1        1.021817
## BLUEBOOKBIN 1.317160  5        1.027931
## HS          1.232395  1        1.110133
## CLAIM       1.032176  1        1.015961
## YOJ         1.354352  1        1.163766
## LHV         1.232619  1        1.110234
## [1] 0.9464048
## [1] 0.262889
## [1] 0.7817957
## [1] 0.6373874
## [1] 0.8562608

Try models out on evaluation data

## Warning: NAs introduced by coercion

Predictions

### First we need to predict who is likely to have a claim.

# Step 1: score every row of exp2 with gmod2a and round the prediction to the
# nearest integer to obtain the predicted claim flag.
exp2$TARGET_FLAG <- round(predict.glm(gmod2a, newdata = exp2))

# Number of rows flagged as a claim.
sum(exp2$TARGET_FLAG)
## [1] 50

# Step 2: for the flagged rows only, predict with mod2b and rescale the result.
# NOTE(review): the constants 17135 and 8, and the divisor `fac`, are defined
# outside this chunk -- confirm they match the scaling used when mod2b was fit.
flagged_rows <- exp2[exp2$TARGET_FLAG == 1, ]
raw_amounts <- predict.lm(mod2b, newdata = flagged_rows)
LogPredAmounts <- (raw_amounts - 17135) / fac + 8

summary(LogPredAmounts)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   3.778   4.370   4.645   4.610   4.874   5.382
sd(LogPredAmounts)
## [1] 0.3693754

# Stack the two histograms on a shared x-range for visual comparison.
# NOTE(review): the summary above reports a minimum of 3.778, which lies below
# the lower x-limit of 4, so the left edge of the second histogram may be cut.
par(mfrow = c(2, 1))
hist(
  traini$LOG_AMT,
  main = "Training Log(Amounts)",
  xlim = c(4, 12)
)
hist(
  LogPredAmounts,
  main = "Experiment Log(Predicted Amounts)",
  xlim = c(4, 12)
)

# My distribution is less tight around the mean but also has fewer outliers