library(MASS)
library(pROC)
Reading in the cleaned-up training, test, and evaluation datasets:
# Load the three pre-processed insurance tables straight from GitHub.
# The trailing `[-1]` drops the first column of each file as it is read
# (presumably a saved row index -- TODO confirm against the CSVs).
ins_df <- read.csv(
  file = "https://raw.githubusercontent.com/charlsjoseph/Data621/master/Data621-Assignment4/insurance_tf_train.csv"
)[-1]
test_set <- read.csv(
  file = "https://raw.githubusercontent.com/charlsjoseph/Data621/master/Data621-Assignment4/insurance_tf_test.csv"
)[-1]
df_eval <- read.csv(
  file = "https://raw.githubusercontent.com/charlsjoseph/Data621/master/Data621-Assignment4/insurance_tf_eval.csv"
)[-1]
We will try to use all the variables for our initial regression model, to identify which appear to show significance.
# Remove TARGET_FLAG from the training table so that the `.` in the formula
# below does not pull it in as a predictor of TARGET_AMT.
ins_df[["TARGET_FLAG"]] <- NULL

# Baseline model: regress TARGET_AMT on every remaining column, then print
# the coefficient table to see which predictors look significant.
reg1 <- lm(
  formula = TARGET_AMT ~ .,
  data = ins_df
)
summary(reg1)
##
## Call:
## lm(formula = TARGET_AMT ~ ., data = ins_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5790 -1695 -749 380 83324
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.460e+02 5.753e+02 0.775 0.438275
## KIDSDRIV 3.345e+02 1.217e+02 2.750 0.005982 **
## AGE 3.263e-01 7.447e+00 0.044 0.965052
## HOMEKIDS 7.088e+01 7.017e+01 1.010 0.312504
## YOJ -1.433e+01 1.587e+01 -0.903 0.366537
## INCOME 2.077e-02 2.872e-02 0.723 0.469600
## PARENT1.Yes 5.409e+02 2.146e+02 2.521 0.011736 *
## HOME_VAL -7.278e-02 3.729e-02 -1.952 0.051009 .
## MSTATUS.Yes -5.666e+02 1.427e+02 -3.970 7.26e-05 ***
## SEX.z_F -1.914e+02 1.823e+02 -1.050 0.293886
## EDUCATION..High.School 5.317e+01 1.831e+02 0.290 0.771482
## EDUCATION.Bachelors -2.093e+02 1.678e+02 -1.247 0.212351
## EDUCATION.Masters 1.424e+01 2.796e+02 0.051 0.959403
## EDUCATION.PhD 1.131e+01 3.349e+02 0.034 0.973055
## JOB.Clerical 5.534e+02 3.598e+02 1.538 0.124089
## JOB.Doctor -3.587e+02 4.333e+02 -0.828 0.407842
## JOB.Home.Maker 5.502e+02 3.729e+02 1.475 0.140155
## JOB.Lawyer 2.025e+02 3.147e+02 0.644 0.519871
## JOB.Manager -5.611e+02 3.097e+02 -1.812 0.070078 .
## JOB.Professional 3.297e+02 3.290e+02 1.002 0.316368
## JOB.Student 3.555e+02 3.896e+02 0.912 0.361611
## JOB.z_Blue.Collar 5.860e+02 3.427e+02 1.710 0.087284 .
## TRAVTIME 1.220e+01 3.452e+00 3.536 0.000409 ***
## CAR_USE.Private -6.584e+02 1.747e+02 -3.769 0.000165 ***
## BLUEBOOK 3.566e-03 6.332e-02 0.056 0.955093
## TIF -5.202e+01 1.293e+01 -4.022 5.83e-05 ***
## CAR_TYPE.Minivan -5.335e+02 1.837e+02 -2.904 0.003703 **
## CAR_TYPE.Panel.Truck -4.252e+01 2.507e+02 -0.170 0.865299
## CAR_TYPE.Sports.Car 4.227e+02 2.416e+02 1.750 0.080187 .
## CAR_TYPE.Van 4.241e+01 2.276e+02 0.186 0.852163
## CAR_TYPE.z_SUV 1.960e+02 2.060e+02 0.951 0.341559
## RED_CAR.yes 1.257e+02 1.587e+02 0.792 0.428507
## OLDCLAIM 4.431e-02 8.977e-02 0.494 0.621637
## CLM_FREQ 2.300e+01 6.777e+01 0.339 0.734354
## REVOKED.Yes 4.373e+02 1.662e+02 2.632 0.008517 **
## MVR_PTS 1.868e+02 2.794e+01 6.686 2.48e-11 ***
## CAR_AGE -2.691e+01 1.357e+01 -1.984 0.047348 *
## URBANICITY.Highly.Urban..Urban 1.735e+03 1.480e+02 11.719 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4321 on 6490 degrees of freedom
## Multiple R-squared: 0.07982, Adjusted R-squared: 0.07458
## F-statistic: 15.22 on 37 and 6490 DF, p-value: < 2.2e-16
There are a lot of variables with low significance (high p-values), so for the next model we will keep only the variables that are significant at the 5% level.
# Reduced model: refit TARGET_AMT on an explicit subset of the predictors
# from the training table, then print its summary.
reg1_1 <- lm(
  formula = TARGET_AMT ~ KIDSDRIV + PARENT1.Yes + MSTATUS.Yes + TRAVTIME +
    CAR_USE.Private + TIF + CAR_TYPE.Minivan + REVOKED.Yes + MVR_PTS +
    CAR_AGE + URBANICITY.Highly.Urban..Urban,
  data = ins_df
)
summary(reg1_1)
##
## Call:
## lm(formula = TARGET_AMT ~ KIDSDRIV + PARENT1.Yes + MSTATUS.Yes +
## TRAVTIME + CAR_USE.Private + TIF + CAR_TYPE.Minivan + REVOKED.Yes +
## MVR_PTS + CAR_AGE + URBANICITY.Highly.Urban..Urban, data = ins_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5883 -1686 -800 325 83348
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 991.532 236.393 4.194 2.77e-05 ***
## KIDSDRIV 385.208 109.887 3.505 0.000459 ***
## PARENT1.Yes 679.254 186.620 3.640 0.000275 ***
## MSTATUS.Yes -623.484 126.821 -4.916 9.04e-07 ***
## TRAVTIME 13.295 3.451 3.853 0.000118 ***
## CAR_USE.Private -693.697 113.927 -6.089 1.20e-09 ***
## TIF -50.771 12.933 -3.926 8.74e-05 ***
## CAR_TYPE.Minivan -615.343 124.555 -4.940 7.99e-07 ***
## REVOKED.Yes 492.546 165.595 2.974 0.002946 **
## MVR_PTS 216.220 25.723 8.406 < 2e-16 ***
## CAR_AGE -57.501 9.605 -5.987 2.26e-09 ***
## URBANICITY.Highly.Urban..Urban 1521.837 138.376 10.998 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4333 on 6516 degrees of freedom
## Multiple R-squared: 0.07098, Adjusted R-squared: 0.06941
## F-statistic: 45.26 on 11 and 6516 DF, p-value: < 2.2e-16
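Narrowing the model to the highly significant variables did not improve the R-squared value: it decreased slightly, from 0.0798 to 0.0710 (adjusted R-squared from 0.0746 to 0.0694). The model is simpler, however, and its F-statistic rose from 15.22 to 45.26, with every remaining predictor significant. The fit is still rather low: only about 7% of the variation in the data can be explained by this model.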
We will now use this regression model to predict values for the evaluation data set.
# Score the evaluation table with the reduced model. Standard errors are not
# requested, so predict() returns a plain vector of fitted values.
# NOTE(review): a linear model can produce negative predicted amounts --
# confirm whether these should be floored at zero before export.
reg_pred <- predict(
  reg1_1,
  newdata = df_eval,
  type = "response",
  se.fit = FALSE
)

# Store the predictions in the TARGET_AMT column and export the table
# without row names.
df_eval[["TARGET_AMT"]] <- reg_pred
write.csv(df_eval, file = "eval_results.csv", row.names = FALSE)