library(MASS)
library(pROC)

Reading in the cleaned-up training, test, and evaluation datasets:

# Load the training, test, and evaluation CSVs from GitHub.
# The trailing `[-1]` drops the first column of each file
# (presumably a saved row index -- confirm against the CSVs).
ins_df <- read.csv(
  file = "https://raw.githubusercontent.com/charlsjoseph/Data621/master/Data621-Assignment4/insurance_tf_train.csv"
)[-1]
test_set <- read.csv(
  file = "https://raw.githubusercontent.com/charlsjoseph/Data621/master/Data621-Assignment4/insurance_tf_test.csv"
)[-1]
df_eval <- read.csv(
  file = "https://raw.githubusercontent.com/charlsjoseph/Data621/master/Data621-Assignment4/insurance_tf_eval.csv"
)[-1]

Multiple Linear Regression

Model 1

We will try to use all the variables for our initial regression model, to identify which appear to show significance.

# Model 1: ordinary least squares of TARGET_AMT on every remaining column.
# TARGET_FLAG is removed first so the `.` in the formula does not pick it up
# as a predictor.
ins_df[["TARGET_FLAG"]] <- NULL
reg1 <- lm(TARGET_AMT ~ ., data = ins_df)
summary(reg1)
## 
## Call:
## lm(formula = TARGET_AMT ~ ., data = ins_df)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -5790  -1695   -749    380  83324 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     4.460e+02  5.753e+02   0.775 0.438275    
## KIDSDRIV                        3.345e+02  1.217e+02   2.750 0.005982 ** 
## AGE                             3.263e-01  7.447e+00   0.044 0.965052    
## HOMEKIDS                        7.088e+01  7.017e+01   1.010 0.312504    
## YOJ                            -1.433e+01  1.587e+01  -0.903 0.366537    
## INCOME                          2.077e-02  2.872e-02   0.723 0.469600    
## PARENT1.Yes                     5.409e+02  2.146e+02   2.521 0.011736 *  
## HOME_VAL                       -7.278e-02  3.729e-02  -1.952 0.051009 .  
## MSTATUS.Yes                    -5.666e+02  1.427e+02  -3.970 7.26e-05 ***
## SEX.z_F                        -1.914e+02  1.823e+02  -1.050 0.293886    
## EDUCATION..High.School          5.317e+01  1.831e+02   0.290 0.771482    
## EDUCATION.Bachelors            -2.093e+02  1.678e+02  -1.247 0.212351    
## EDUCATION.Masters               1.424e+01  2.796e+02   0.051 0.959403    
## EDUCATION.PhD                   1.131e+01  3.349e+02   0.034 0.973055    
## JOB.Clerical                    5.534e+02  3.598e+02   1.538 0.124089    
## JOB.Doctor                     -3.587e+02  4.333e+02  -0.828 0.407842    
## JOB.Home.Maker                  5.502e+02  3.729e+02   1.475 0.140155    
## JOB.Lawyer                      2.025e+02  3.147e+02   0.644 0.519871    
## JOB.Manager                    -5.611e+02  3.097e+02  -1.812 0.070078 .  
## JOB.Professional                3.297e+02  3.290e+02   1.002 0.316368    
## JOB.Student                     3.555e+02  3.896e+02   0.912 0.361611    
## JOB.z_Blue.Collar               5.860e+02  3.427e+02   1.710 0.087284 .  
## TRAVTIME                        1.220e+01  3.452e+00   3.536 0.000409 ***
## CAR_USE.Private                -6.584e+02  1.747e+02  -3.769 0.000165 ***
## BLUEBOOK                        3.566e-03  6.332e-02   0.056 0.955093    
## TIF                            -5.202e+01  1.293e+01  -4.022 5.83e-05 ***
## CAR_TYPE.Minivan               -5.335e+02  1.837e+02  -2.904 0.003703 ** 
## CAR_TYPE.Panel.Truck           -4.252e+01  2.507e+02  -0.170 0.865299    
## CAR_TYPE.Sports.Car             4.227e+02  2.416e+02   1.750 0.080187 .  
## CAR_TYPE.Van                    4.241e+01  2.276e+02   0.186 0.852163    
## CAR_TYPE.z_SUV                  1.960e+02  2.060e+02   0.951 0.341559    
## RED_CAR.yes                     1.257e+02  1.587e+02   0.792 0.428507    
## OLDCLAIM                        4.431e-02  8.977e-02   0.494 0.621637    
## CLM_FREQ                        2.300e+01  6.777e+01   0.339 0.734354    
## REVOKED.Yes                     4.373e+02  1.662e+02   2.632 0.008517 ** 
## MVR_PTS                         1.868e+02  2.794e+01   6.686 2.48e-11 ***
## CAR_AGE                        -2.691e+01  1.357e+01  -1.984 0.047348 *  
## URBANICITY.Highly.Urban..Urban  1.735e+03  1.480e+02  11.719  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4321 on 6490 degrees of freedom
## Multiple R-squared:  0.07982,    Adjusted R-squared:  0.07458 
## F-statistic: 15.22 on 37 and 6490 DF,  p-value: < 2.2e-16

There are a lot of variables with low significance (high p-values), so for the next model we will keep only the variables that are significant at the 0.05 level.

# Reduced model: refit TARGET_AMT on the subset of predictors chosen from
# the full model's coefficient table.
reg1_1 <- lm(
  TARGET_AMT ~
    KIDSDRIV +
    PARENT1.Yes +
    MSTATUS.Yes +
    TRAVTIME +
    CAR_USE.Private +
    TIF +
    CAR_TYPE.Minivan +
    REVOKED.Yes +
    MVR_PTS +
    CAR_AGE +
    URBANICITY.Highly.Urban..Urban,
  data = ins_df
)
summary(reg1_1)
## 
## Call:
## lm(formula = TARGET_AMT ~ KIDSDRIV + PARENT1.Yes + MSTATUS.Yes + 
##     TRAVTIME + CAR_USE.Private + TIF + CAR_TYPE.Minivan + REVOKED.Yes + 
##     MVR_PTS + CAR_AGE + URBANICITY.Highly.Urban..Urban, data = ins_df)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -5883  -1686   -800    325  83348 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     991.532    236.393   4.194 2.77e-05 ***
## KIDSDRIV                        385.208    109.887   3.505 0.000459 ***
## PARENT1.Yes                     679.254    186.620   3.640 0.000275 ***
## MSTATUS.Yes                    -623.484    126.821  -4.916 9.04e-07 ***
## TRAVTIME                         13.295      3.451   3.853 0.000118 ***
## CAR_USE.Private                -693.697    113.927  -6.089 1.20e-09 ***
## TIF                             -50.771     12.933  -3.926 8.74e-05 ***
## CAR_TYPE.Minivan               -615.343    124.555  -4.940 7.99e-07 ***
## REVOKED.Yes                     492.546    165.595   2.974 0.002946 ** 
## MVR_PTS                         216.220     25.723   8.406  < 2e-16 ***
## CAR_AGE                         -57.501      9.605  -5.987 2.26e-09 ***
## URBANICITY.Highly.Urban..Urban 1521.837    138.376  10.998  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4333 on 6516 degrees of freedom
## Multiple R-squared:  0.07098,    Adjusted R-squared:  0.06941 
## F-statistic: 45.26 on 11 and 6516 DF,  p-value: < 2.2e-16

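Narrowing the model to the highly significant variables did not improve the R-squared value: multiple R-squared fell slightly from 0.0798 to 0.0710 (adjusted R-squared from 0.0746 to 0.0694). The reduced model is, however, much simpler (11 predictors instead of 37), every remaining term is significant, and the F-statistic rose from 15.22 to 45.26. The fit is still rather low: only about 7% of the variation in the data can be explained by this model.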

We will now use this regression model to predict values for the evaluation data set.

# Score the evaluation set with the reduced model, store the fitted values
# in its TARGET_AMT column, and export the scored data to CSV.
reg_pred <- predict(
  object = reg1_1,
  newdata = df_eval,
  type = "response",
  se.fit = FALSE
)
df_eval[["TARGET_AMT"]] <- reg_pred

write.csv(x = df_eval, file = "eval_results.csv", row.names = FALSE)