library("psych")
library("caret")
library("pROC")
library("dplyr")
library("car")
library("kableExtra")
library("mlbench")

#devtools::session_info()

loc_train = "https://raw.githubusercontent.com/chrisestevez/DataAnalytics/master/Data/Hw4/insurance_training_data.csv"
loc_test = "https://raw.githubusercontent.com/chrisestevez/DataAnalytics/master/Data/Hw4/insurance-evaluation-data.csv"

train_df = read.csv(loc_train, stringsAsFactors = FALSE)
test_df = read.csv(loc_test, stringsAsFactors = FALSE)

MY_ROC = function(labels, scores, pname){
  # Order the 0/1 labels by decreasing score, then accumulate TPR and FPR
  labels = labels[order(scores, decreasing=TRUE)]
  result = data.frame(TPR=cumsum(labels)/sum(labels), FPR=cumsum(!labels)/sum(!labels), labels)
  
  # Trapezoidal AUC: rectangle areas plus a triangle correction for ties
  dFPR = c(diff(result$FPR), 0)
  dTPR = c(diff(result$TPR), 0)
  AUC = round(sum(result$TPR * dFPR) + sum(dTPR * dFPR)/2, 4)

  plot(result$FPR, result$TPR, type="l", main=paste0(pname," ROC Curve"),
       ylab="Sensitivity", xlab="1-Specificity")
  abline(a=0, b=1)
  legend(.6, .2, AUC, title = "AUC")
}
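
As a quick sanity check (toy data only, not part of the assignment), the hand-rolled AUC can be compared with pROC's:

# Toy-data check: MY_ROC's trapezoidal AUC should agree with pROC's estimate
set.seed(1)
toy_labels = rbinom(100, 1, 0.5)
toy_scores = toy_labels + rnorm(100)
MY_ROC(toy_labels, toy_scores, "Toy")
pROC::auc(toy_labels, toy_scores)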

#?describe
#describe(train_df,na.rm = TRUE)

Overview

In this homework assignment, you will explore, analyze and model a data set containing approximately 8,000 records, each representing a customer of an auto insurance company. Each record has two response variables. The first, TARGET_FLAG, is a 1 or a 0: a 1 means the person was in a car crash, and a 0 means they were not. The second response variable is TARGET_AMT. This value is zero if the person did not crash their car; if they did, it is a value greater than zero.

Your objective is to build multiple linear regression and binary logistic regression models on the training data to predict the probability that a person will crash their car and also the amount of money it will cost if the person does crash their car. You can only use the variables given to you (or variables that you derive from the variables provided). Below is a short description of the variables of interest in the data set:

Deliverables:

  • A write-up submitted in PDF format. Your write-up should have four sections. Each one is described below. You may assume you are addressing me as a fellow data scientist, so you need not shy away from technical details.

  • Assigned predictions (probabilities, classifications, cost) for the evaluation data set. Use a 0.5 threshold.

  • Include your R statistical programming code in an Appendix.

Insurance Institute for Highway Safety

DATA PREPARATION

For data preparation, I started by visiting the Insurance Institute for Highway Safety (IIHS) website, which highlights age groups in which accidents decrease. With this in mind, I created an age factor, AGE_CAT, grouped by IIHS buckets. I also created various dummy variables, such as KIDSDRIV_Dummy, set to 1 if true and 0 if false; the same was done for home ownership, a master's degree, and a Ph.D. Missing values in specific variables were replaced with medians, currency columns were converted to numeric, and other variables were converted to factors. Finally, due to high collinearity, certain variables were removed from the data set.
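
Because the same currency cleaning and NA handling repeat below for several columns, and again for the test set, a small helper could consolidate them. This is only a sketch; clean_currency is a hypothetical function, not part of the original script.

# Hypothetical helper: strip "$" and "," from a currency column and
# optionally replace the resulting NAs, mirroring the repeated gsub() calls below
clean_currency = function(x, na_value = NA) {
  out = as.numeric(gsub('\\$|,', '', x))
  out[is.na(out)] = na_value
  out
}
# e.g. train_df$INCOME = clean_currency(train_df$INCOME, na_value = 0)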

"KIDSDRIV will be converted into a dummy variable 1 has kids and 0 no driving kids"
## [1] "KIDSDRIV will be converted into a dummy variable 1 has kids and 0 no driving kids"
train_df$KIDSDRIV_Dummy = ifelse(train_df$KIDSDRIV==0,0,1)

#AGE appears to follow a normal distribution, so impute missing values with the median
train_df$AGE[is.na(train_df$AGE)]= median(train_df$AGE,na.rm = TRUE)

#Created age groups based on IIHS buckets
train_df$AGE_CAT = cut(train_df$AGE,
                       breaks = c(-Inf, 19, 24, 29, 34, 54, 59, 64, 69, 74, 79, Inf),
                       labels = c("16-19","20-24","25-29","30-34","35-54","55-59",
                                  "60-64","65-69","70-74","75-79","80+"),
                       ordered_result = TRUE)

"convert variable kids at home into a dummy variable"
## [1] "convert variable kids at home into a dummy variable"
train_df$HOMEKIDS_Dummy = ifelse(train_df$HOMEKIDS==0,0,1)


train_df$YOJ[is.na(train_df$YOJ)]= median(train_df$YOJ,na.rm = TRUE)
#train_df$YOJ[train_df$YOJ==0]= 0


"cleans simbols from numeric variables"
## [1] "cleans simbols from numeric variables"
train_df$INCOME = as.numeric(gsub('\\$|,', '', train_df$INCOME))
train_df$INCOME[is.na(train_df$INCOME)]= 0

train_df$PARENT1 = factor(train_df$PARENT1)

train_df$HOME_VAL = as.numeric(gsub('\\$|,', '', train_df$HOME_VAL))

train_df$HOME_VAL[is.na(train_df$HOME_VAL)]= 0

train_df$HASHOME_Dummy = ifelse(train_df$HOME_VAL==0,0,1)

train_df$MSTATUS = factor(train_df$MSTATUS)

train_df$SEX = factor(train_df$SEX)

train_df$Masters_Dummy =ifelse(train_df$EDUCATION %in% c("Masters"), 1,0)

train_df$PHD_Dummy =ifelse(train_df$EDUCATION %in% c("PhD"), 1,0)

train_df$JOB[train_df$JOB==""]= "UNKNOWN"

train_df$JOB = factor(train_df$JOB)

train_df$CAR_USE = factor(train_df$CAR_USE)

train_df$BLUEBOOK = as.numeric(gsub('\\$|,', '', train_df$BLUEBOOK))

train_df$CAR_TYPE = factor(train_df$CAR_TYPE)

train_df$RED_CAR = factor(train_df$RED_CAR)

train_df$OLDCLAIM = as.numeric(gsub('\\$|,', '', train_df$OLDCLAIM))

train_df$CLM_FREQ_Dummy = ifelse(train_df$CLM_FREQ==0,0,1)

train_df$REVOKED = factor(train_df$REVOKED)

train_df$URBANICITY = factor(train_df$URBANICITY)

train_df$CAR_AGE[is.na(train_df$CAR_AGE)]= 0
train_df$CAR_AGE[train_df$CAR_AGE==-3]= 3


train_df = train_df %>% select(-KIDSDRIV,-HOMEKIDS,-CLM_FREQ,-HOME_VAL,-INDEX)

train_df$TARGET_FLAG = factor(train_df$TARGET_FLAG)

summary(train_df)
##  TARGET_FLAG   TARGET_AMT          AGE             YOJ       
##  0:6008      Min.   :     0   Min.   :16.00   Min.   : 0.00  
##  1:2153      1st Qu.:     0   1st Qu.:39.00   1st Qu.: 9.00  
##              Median :     0   Median :45.00   Median :11.00  
##              Mean   :  1504   Mean   :44.79   Mean   :10.53  
##              3rd Qu.:  1036   3rd Qu.:51.00   3rd Qu.:13.00  
##              Max.   :107586   Max.   :81.00   Max.   :23.00  
##                                                              
##      INCOME       PARENT1    MSTATUS      SEX        EDUCATION        
##  Min.   :     0   No :7084   Yes :4894   M  :3786   Length:8161       
##  1st Qu.: 23157   Yes:1077   z_No:3267   z_F:4375   Class :character  
##  Median : 51116                                     Mode  :character  
##  Mean   : 58523                                                       
##  3rd Qu.: 83304                                                       
##  Max.   :367030                                                       
##                                                                       
##             JOB          TRAVTIME            CAR_USE        BLUEBOOK    
##  z_Blue Collar:1825   Min.   :  5.00   Commercial:3029   Min.   : 1500  
##  Clerical     :1271   1st Qu.: 22.00   Private   :5132   1st Qu.: 9280  
##  Professional :1117   Median : 33.00                     Median :14440  
##  Manager      : 988   Mean   : 33.49                     Mean   :15710  
##  Lawyer       : 835   3rd Qu.: 44.00                     3rd Qu.:20850  
##  Student      : 712   Max.   :142.00                     Max.   :69740  
##  (Other)      :1413                                                     
##       TIF                CAR_TYPE    RED_CAR       OLDCLAIM     REVOKED   
##  Min.   : 1.000   Minivan    :2145   no :5783   Min.   :    0   No :7161  
##  1st Qu.: 1.000   Panel Truck: 676   yes:2378   1st Qu.:    0   Yes:1000  
##  Median : 4.000   Pickup     :1389              Median :    0             
##  Mean   : 5.351   Sports Car : 907              Mean   : 4037             
##  3rd Qu.: 7.000   Van        : 750              3rd Qu.: 4636             
##  Max.   :25.000   z_SUV      :2294              Max.   :57037             
##                                                                           
##     MVR_PTS          CAR_AGE                       URBANICITY  
##  Min.   : 0.000   Min.   : 0.000   Highly Urban/ Urban  :6492  
##  1st Qu.: 0.000   1st Qu.: 1.000   z_Highly Rural/ Rural:1669  
##  Median : 1.000   Median : 8.000                               
##  Mean   : 1.696   Mean   : 7.809                               
##  3rd Qu.: 3.000   3rd Qu.:12.000                               
##  Max.   :13.000   Max.   :28.000                               
##                                                                
##  KIDSDRIV_Dummy      AGE_CAT     HOMEKIDS_Dummy   HASHOME_Dummy   
##  Min.   :0.0000   35-54  :6121   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000   55-59  : 725   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :0.0000   30-34  : 649   Median :0.0000   Median :1.0000  
##  Mean   :0.1202   60-64  : 265   Mean   :0.3519   Mean   :0.6621  
##  3rd Qu.:0.0000   25-29  : 249   3rd Qu.:1.0000   3rd Qu.:1.0000  
##  Max.   :1.0000   65-69  :  65   Max.   :1.0000   Max.   :1.0000  
##                   (Other):  87                                    
##  Masters_Dummy      PHD_Dummy      CLM_FREQ_Dummy  
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :0.0000   Median :0.0000   Median :0.0000  
##  Mean   :0.2032   Mean   :0.0892   Mean   :0.3862  
##  3rd Qu.:0.0000   3rd Qu.:0.0000   3rd Qu.:1.0000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
## 

DATA EXPLORATION

knitr::kable(round(describe(train_df,na.rm = TRUE),2), format = "html", booktabs = T) %>% kable_styling(latex_options = c("striped", "scale_down"))
vars n mean sd median trimmed mad min max range skew kurtosis se
TARGET_FLAG* 1 8161 1.26 0.44 1 1.20 0.00 1 2.0 1.0 1.07 -0.85 0.00
TARGET_AMT 2 8161 1504.32 4704.03 0 593.71 0.00 0 107586.1 107586.1 8.71 112.29 52.07
AGE 3 8161 44.79 8.62 45 44.83 8.90 16 81.0 65.0 -0.03 -0.06 0.10
YOJ 4 8161 10.53 3.98 11 11.08 2.97 0 23.0 23.0 -1.26 1.45 0.04
INCOME 5 8161 58522.94 48345.51 51116 52958.35 44211.13 0 367030.0 367030.0 1.16 1.99 535.16
PARENT1* 6 8161 1.13 0.34 1 1.04 0.00 1 2.0 1.0 2.17 2.73 0.00
MSTATUS* 7 8161 1.40 0.49 1 1.38 0.00 1 2.0 1.0 0.41 -1.83 0.01
SEX* 8 8161 1.54 0.50 2 1.55 0.00 1 2.0 1.0 -0.14 -1.98 0.01
EDUCATION* 9 8161 NaN NA NA NaN NA Inf -Inf -Inf NA NA NA
JOB* 10 8161 5.43 2.76 6 5.53 2.97 1 9.0 8.0 -0.21 -1.17 0.03
TRAVTIME 11 8161 33.49 15.91 33 33.00 16.31 5 142.0 137.0 0.45 0.66 0.18
CAR_USE* 12 8161 1.63 0.48 2 1.66 0.00 1 2.0 1.0 -0.53 -1.72 0.01
BLUEBOOK 13 8161 15709.90 8419.73 14440 15036.89 8450.82 1500 69740.0 68240.0 0.79 0.79 93.20
TIF 14 8161 5.35 4.15 4 4.84 4.45 1 25.0 24.0 0.89 0.42 0.05
CAR_TYPE* 15 8161 3.53 1.97 3 3.54 2.97 1 6.0 5.0 0.00 -1.52 0.02
RED_CAR* 16 8161 1.29 0.45 1 1.24 0.00 1 2.0 1.0 0.92 -1.16 0.01
OLDCLAIM 17 8161 4037.08 8777.14 0 1719.29 0.00 0 57037.0 57037.0 3.12 9.86 97.16
REVOKED* 18 8161 1.12 0.33 1 1.03 0.00 1 2.0 1.0 2.30 3.30 0.00
MVR_PTS 19 8161 1.70 2.15 1 1.31 1.48 0 13.0 13.0 1.35 1.38 0.02
CAR_AGE 20 8161 7.81 5.88 8 7.41 7.41 0 28.0 28.0 0.33 -0.81 0.07
URBANICITY* 21 8161 1.20 0.40 1 1.13 0.00 1 2.0 1.0 1.46 0.15 0.00
KIDSDRIV_Dummy 22 8161 0.12 0.33 0 0.03 0.00 0 1.0 1.0 2.34 3.45 0.00
AGE_CAT* 23 8161 5.02 0.79 5 5.02 0.00 1 11.0 10.0 0.14 6.41 0.01
HOMEKIDS_Dummy 24 8161 0.35 0.48 0 0.31 0.00 0 1.0 1.0 0.62 -1.62 0.01
HASHOME_Dummy 25 8161 0.66 0.47 1 0.70 0.00 0 1.0 1.0 -0.69 -1.53 0.01
Masters_Dummy 26 8161 0.20 0.40 0 0.13 0.00 0 1.0 1.0 1.48 0.18 0.00
PHD_Dummy 27 8161 0.09 0.29 0 0.00 0.00 0 1.0 1.0 2.88 6.31 0.00
CLM_FREQ_Dummy 28 8161 0.39 0.49 0 0.36 0.00 0 1.0 1.0 0.47 -1.78 0.01

The correlation matrix shows some notably high correlations: AGE with HOMEKIDS_Dummy, OLDCLAIM with CLM_FREQ_Dummy, MVR_PTS with CLM_FREQ_Dummy, and CAR_AGE with Masters_Dummy.

library("corrplot")
cor_mx = cor(dplyr::select_if(train_df, is.numeric) ,use="pairwise.complete.obs", method = "pearson")
corrplot(cor_mx, method = "color", 
         type = "upper", order = "original", number.cex = .7,
         addCoef.col = "black", # add correlation coefficients
         tl.col = "black", tl.srt = 90, # text label color and rotation
         diag = TRUE) # keep the principal diagonal visible

library("PerformanceAnalytics")
Numericcols = as.data.frame(select_if(train_df, is.numeric))

chart.Correlation(Numericcols[,2:10])

chart.Correlation(Numericcols[,11:ncol(Numericcols)])
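
To back the visual impression with numbers, the strongest pairwise correlations can be pulled straight out of cor_mx. A small sketch; the 0.4 cutoff is an arbitrary choice.

# List variable pairs whose absolute Pearson correlation exceeds 0.4
high_cor = which(abs(cor_mx) > 0.4 & upper.tri(cor_mx), arr.ind = TRUE)
data.frame(var1 = rownames(cor_mx)[high_cor[, 1]],
           var2 = colnames(cor_mx)[high_cor[, 2]],
           r    = round(cor_mx[high_cor], 2))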

BUILD MODELS

SVM Model

My first model is an SVM with a radial kernel (svmRadial). Its accuracy on the training data is 97.3% and its kappa is 0.93. Note that these are resubstitution metrics, and that TARGET_AMT, which is greater than zero exactly when TARGET_FLAG is 1, is still among the predictors, so they should be read skeptically.

control = trainControl(method="repeatedcv", number=3, repeats=1)

set.seed(7)
modelSvm = train(TARGET_FLAG~., data=train_df, method="svmRadial", trControl=control)

# Note: this appends the predictions as a column of train_df, so any later
# model fit with TARGET_FLAG ~ . on train_df will see them as a predictor
train_df$mypredictedSVM = predict(modelSvm,train_df)

M1 =confusionMatrix(factor(train_df$mypredictedSVM),factor(train_df$TARGET_FLAG),dnn = c("Prediction", "Reference"))
M1
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 6008  220
##          1    0 1933
##                                           
##                Accuracy : 0.973           
##                  95% CI : (0.9693, 0.9764)
##     No Information Rate : 0.7362          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9282          
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 1.0000          
##             Specificity : 0.8978          
##          Pos Pred Value : 0.9647          
##          Neg Pred Value : 1.0000          
##              Prevalence : 0.7362          
##          Detection Rate : 0.7362          
##    Detection Prevalence : 0.7631          
##       Balanced Accuracy : 0.9489          
##                                           
##        'Positive' Class : 0               
## 
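
The confusion matrix above is computed on the same rows the model was fit to. The cross-validated estimates caret stores for each tuning candidate are a less optimistic gauge:

# Held-out (3-fold CV) accuracy and kappa per tuning candidate, in contrast
# to the resubstitution metrics in the confusion matrix above
modelSvm$results[, c("sigma", "C", "Accuracy", "Kappa")]
modelSvm$bestTune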

SVM Model with Box-Cox

My second model is the radial SVM with a Box-Cox transformation. The confusion matrix is identical to the first model's: accuracy 97.3% and kappa 0.93, so the transformation made no difference here.

control = trainControl(method="repeatedcv", number=3, repeats=1)

set.seed(7)
modelSvmBoxCox = train(TARGET_FLAG~., data=train_df, method="svmRadial", trControl=control,preProcess = "BoxCox")

train_df$modelSvmBoxCox = predict(modelSvmBoxCox,train_df)

M2= confusionMatrix(factor(train_df$modelSvmBoxCox),factor(train_df$TARGET_FLAG),dnn = c("Prediction", "Reference"))
M2
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 6008  220
##          1    0 1933
##                                           
##                Accuracy : 0.973           
##                  95% CI : (0.9693, 0.9764)
##     No Information Rate : 0.7362          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9282          
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 1.0000          
##             Specificity : 0.8978          
##          Pos Pred Value : 0.9647          
##          Neg Pred Value : 1.0000          
##              Prevalence : 0.7362          
##          Detection Rate : 0.7362          
##    Detection Prevalence : 0.7631          
##       Balanced Accuracy : 0.9489          
##                                           
##        'Positive' Class : 0               
## 
summary(modelSvmBoxCox)
## Length  Class   Mode 
##      1   ksvm     S4
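
Since the confusion matrix is identical to the untransformed model's, it is worth checking what the Box-Cox step actually did. caret keeps the fitted preprocessing object on the train object:

# Shows which predictors were Box-Cox transformed during preprocessing
modelSvmBoxCox$preProcess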

GBM

The GBM model looks attractive at first: 100% training accuracy and an AUC of 1. I am not confident these results are meaningful, though: train_df still contains TARGET_AMT, which is greater than zero exactly when TARGET_FLAG is 1, as well as the SVM prediction columns appended above, so TARGET_FLAG ~ . can reconstruct the outcome directly. The model was selected over the GLM for its ability to handle factors within the model, and I did not tune it further given the already saturated kappa.
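
One way to test that suspicion, sketched below, is to drop TARGET_AMT and the appended prediction columns before refitting. This is only a sketch; clean_df is a hypothetical copy and was not part of the original pipeline.

# Hypothetical guard: TARGET_AMT is nonzero exactly when TARGET_FLAG is 1,
# and mypredictedSVM/modelSvmBoxCox are predictions of the outcome itself,
# so none of them should be visible to a TARGET_FLAG ~ . fit
leaky = intersect(c("TARGET_AMT", "mypredictedSVM", "modelSvmBoxCox"),
                  names(train_df))
clean_df = train_df %>% select(-all_of(leaky))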

objControl = trainControl(method='repeatedcv', number=3, returnResamp='none', summaryFunction = twoClassSummary, classProbs = TRUE)

train_df$TARGET_FLAG =ifelse(train_df$TARGET_FLAG==1,'ACDNT','noACDNT')
train_df$TARGET_FLAG = as.factor(train_df$TARGET_FLAG)

set.seed(7)
modelgbm = train(TARGET_FLAG~., train_df, 
                  method='gbm', 
                  trControl=objControl,  
                  metric = "ROC",
                  preProc = c("center", "scale"),
                  verbose = FALSE) # suppress gbm's per-iteration training log
train_df$mypredictedGBM = predict(modelgbm,train_df, type='raw')

trainPROBS = predict(modelgbm,train_df, type='prob')
M3=confusionMatrix(factor(train_df$mypredictedGBM),factor(train_df$TARGET_FLAG),dnn = c("Prediction", "Reference"))

M3
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction ACDNT noACDNT
##    ACDNT    2153       0
##    noACDNT     0    6008
##                                      
##                Accuracy : 1          
##                  95% CI : (0.9995, 1)
##     No Information Rate : 0.7362     
##     P-Value [Acc > NIR] : < 2.2e-16  
##                                      
##                   Kappa : 1          
##  Mcnemar's Test P-Value : NA         
##                                      
##             Sensitivity : 1.0000     
##             Specificity : 1.0000     
##          Pos Pred Value : 1.0000     
##          Neg Pred Value : 1.0000     
##              Prevalence : 0.2638     
##          Detection Rate : 0.2638     
##    Detection Prevalence : 0.2638     
##       Balanced Accuracy : 1.0000     
##                                      
##        'Positive' Class : ACDNT      
## 
MY_ROC(as.numeric(ifelse(train_df$TARGET_FLAG=="ACDNT",1,0)),trainPROBS$ACDNT,"Model3")

roc(train_df$TARGET_FLAG,trainPROBS$ACDNT)
## 
## Call:
## roc.default(response = train_df$TARGET_FLAG, predictor = trainPROBS$ACDNT)
## 
## Data: trainPROBS$ACDNT in 2153 controls (train_df$TARGET_FLAG ACDNT) > 6008 cases (train_df$TARGET_FLAG noACDNT).
## Area under the curve: 1

Linear models

Model 1

For the first linear model I subsetted the data frame, and many of the dummy variables came out statistically significant. Variables that significantly increase the claim amount include travel time, a revoked license, driving a sports car, and prior claim frequency. I also tried scaling the variables, but it made little difference. The model was checked for collinearity and showed none (all VIFs are below 2); the adjusted R-squared is 0.045 and the cross-validated RMSE is 4582.52.

control = trainControl(method="repeatedcv", number=3, repeats=1)
#train_df$TARGET_FLAG = ifelse(train_df$TARGET_FLAG=="ACDNT",1,0)

LM_df =train_df %>%  select(TARGET_AMT,AGE,YOJ,INCOME, TRAVTIME,BLUEBOOK,OLDCLAIM,MVR_PTS,CAR_AGE,KIDSDRIV_Dummy,HOMEKIDS_Dummy,HASHOME_Dummy,Masters_Dummy,PHD_Dummy,CLM_FREQ_Dummy,SEX,CAR_USE,CAR_TYPE,RED_CAR,REVOKED)


set.seed(7)
modelLM1 = train(TARGET_AMT~., data=LM_df %>% select(-AGE,-YOJ,-RED_CAR,-PHD_Dummy,-Masters_Dummy,-BLUEBOOK,-SEX) , method="lm", trControl=control)



summary(modelLM1)
## 
## Call:
## lm(formula = .outcome ~ ., data = dat)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -5235  -1645   -862     78 104074 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            1.226e+03  2.203e+02   5.568 2.67e-08 ***
## INCOME                -2.542e-03  1.176e-03  -2.162 0.030678 *  
## TRAVTIME               6.690e+00  3.206e+00   2.087 0.036941 *  
## OLDCLAIM              -2.151e-02  8.044e-03  -2.674 0.007512 ** 
## MVR_PTS                1.869e+02  2.690e+01   6.948 4.00e-12 ***
## CAR_AGE               -2.284e+01  9.349e+00  -2.443 0.014590 *  
## KIDSDRIV_Dummy         4.987e+02  1.787e+02   2.791 0.005271 ** 
## HOMEKIDS_Dummy         3.121e+02  1.239e+02   2.520 0.011753 *  
## HASHOME_Dummy         -4.858e+02  1.087e+02  -4.469 7.97e-06 ***
## CLM_FREQ_Dummy         8.849e+02  1.445e+02   6.126 9.44e-10 ***
## CAR_USEPrivate        -7.559e+02  1.264e+02  -5.980 2.32e-09 ***
## `CAR_TYPEPanel Truck`  4.632e+02  2.296e+02   2.018 0.043658 *  
## CAR_TYPEPickup         3.084e+02  1.666e+02   1.852 0.064106 .  
## `CAR_TYPESports Car`   6.597e+02  1.835e+02   3.596 0.000325 ***
## CAR_TYPEVan            5.811e+02  2.023e+02   2.873 0.004076 ** 
## CAR_TYPEz_SUV          4.064e+02  1.394e+02   2.915 0.003562 ** 
## REVOKEDYes             8.539e+02  1.761e+02   4.849 1.27e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4596 on 8144 degrees of freedom
## Multiple R-squared:  0.04718,    Adjusted R-squared:  0.04531 
## F-statistic: 25.21 on 16 and 8144 DF,  p-value: < 2.2e-16
car::vif(modelLM1$finalModel)
##                INCOME              TRAVTIME              OLDCLAIM 
##              1.248982              1.004751              1.925567 
##               MVR_PTS               CAR_AGE        KIDSDRIV_Dummy 
##              1.288589              1.165372              1.304497 
##        HOMEKIDS_Dummy         HASHOME_Dummy        CLM_FREQ_Dummy 
##              1.351526              1.021534              1.911021 
##        CAR_USEPrivate `CAR_TYPEPanel Truck`        CAR_TYPEPickup 
##              1.440733              1.546657              1.513736 
##  `CAR_TYPESports Car`           CAR_TYPEVan         CAR_TYPEz_SUV 
##              1.284338              1.318866              1.517278 
##            REVOKEDYes 
##              1.288196
modelLM1$results
##   intercept    RMSE   Rsquared      MAE   RMSESD RsquaredSD    MAESD
## 1      TRUE 4582.52 0.04462884 2006.638 488.3737 0.00325737 94.59403
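
Given the low R-squared and the fact that most TARGET_AMT values are exactly zero, base-R residual plots of the final fit would likely show the distributional problem directly. A sketch, not run here:

# Standard lm diagnostics (residuals vs fitted, Q-Q, scale-location, leverage)
par(mfrow = c(2, 2))
plot(modelLM1$finalModel)
par(mfrow = c(1, 1))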

Model 2

For the second model I used all the variables in LM_df. The predictors were centered and scaled; the results were similar to the first model, with an adjusted R-squared of 0.045 and a cross-validated RMSE of 4583.66.

control = trainControl(method="repeatedcv", number=3, repeats=1)

set.seed(7)
modelLM2 = train(TARGET_AMT~., data=LM_df , method="lm", trControl=control,preProcess = c("center", "scale"))

#c("center", "scale")

modelLM2$results
##   intercept     RMSE   Rsquared      MAE   RMSESD  RsquaredSD    MAESD
## 1      TRUE 4583.656 0.04419743 2013.002 487.1691 0.003022803 90.60408
summary(modelLM2)
## 
## Call:
## lm(formula = .outcome ~ ., data = dat)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -5196  -1648   -872     87 104110 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           1504.3246    50.8786  29.567  < 2e-16 ***
## AGE                      0.3269    62.5428   0.005 0.995829    
## YOJ                     -7.2604    55.0647  -0.132 0.895105    
## INCOME                -165.4941    65.8724  -2.512 0.012012 *  
## TRAVTIME               108.8801    51.0544   2.133 0.032985 *  
## BLUEBOOK               126.7672    73.0987   1.734 0.082922 .  
## OLDCLAIM              -189.2289    70.6250  -2.679 0.007391 ** 
## MVR_PTS                403.0278    57.8076   6.972 3.37e-12 ***
## CAR_AGE               -165.6099    62.7880  -2.638 0.008365 ** 
## KIDSDRIV_Dummy         158.0869    59.2933   2.666 0.007687 ** 
## HOMEKIDS_Dummy         170.3104    69.4497   2.452 0.014216 *  
## HASHOME_Dummy         -231.2843    52.4660  -4.408 1.06e-05 ***
## Masters_Dummy           64.5269    64.4588   1.001 0.316829    
## PHD_Dummy               46.5770    62.6910   0.743 0.457526    
## CLM_FREQ_Dummy         429.2143    70.3952   6.097 1.13e-09 ***
## SEXz_F                -182.5243    91.8808  -1.987 0.047007 *  
## CAR_USEPrivate        -376.3863    62.4016  -6.032 1.69e-09 ***
## `CAR_TYPEPanel Truck`   42.7397    72.8492   0.587 0.557430    
## CAR_TYPEPickup         124.0336    63.2721   1.960 0.049993 *  
## `CAR_TYPESports Car`   292.2436    69.1708   4.225 2.42e-05 ***
## CAR_TYPEVan            124.8957    60.9474   2.049 0.040471 *  
## CAR_TYPEz_SUV          300.7093    81.4739   3.691 0.000225 ***
## RED_CARyes             -13.6309    68.4483  -0.199 0.842157    
## REVOKEDYes             278.4177    57.7800   4.819 1.47e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4596 on 8137 degrees of freedom
## Multiple R-squared:  0.04798,    Adjusted R-squared:  0.04528 
## F-statistic: 17.83 on 23 and 8137 DF,  p-value: < 2.2e-16
car::vif(modelLM2$finalModel)
##                   AGE                   YOJ                INCOME 
##              1.510886              1.171177              1.676040 
##              TRAVTIME              BLUEBOOK              OLDCLAIM 
##              1.006800              2.063935              1.926611 
##               MVR_PTS               CAR_AGE        KIDSDRIV_Dummy 
##              1.290763              1.522754              1.357965 
##        HOMEKIDS_Dummy         HASHOME_Dummy         Masters_Dummy 
##              1.863020              1.063243              1.604873 
##             PHD_Dummy        CLM_FREQ_Dummy                SEXz_F 
##              1.518055              1.914091              3.260813 
##        CAR_USEPrivate `CAR_TYPEPanel Truck`        CAR_TYPEPickup 
##              1.504071              2.049871              1.546328 
##  `CAR_TYPESports Car`           CAR_TYPEVan         CAR_TYPEz_SUV 
##              1.848087              1.434785              2.563974 
##            RED_CARyes            REVOKEDYes 
##              1.809684              1.289530
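
Because both linear models were trained with the same seed and the same 3-fold trainControl, caret's resamples() can compare them fold by fold. A sketch, assuming the resampling indices match:

# Fold-by-fold comparison of the two linear models on shared CV splits
res = resamples(list(LM1 = modelLM1, LM2 = modelLM2))
summary(res)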

SELECT MODELS

For the final models, I will choose model 1 (the radial SVM) for classification and linear model 1 for the claim amount. I felt more confident using model 1 for classification; model 3's perfect fit on the training data is too good to trust.

For the linear regression, model 1 seems like the right choice: it uses fewer variables, its coefficients are statistically significant, and it has a slightly lower RMSE.

Binary_df =data.frame(M1$byClass)
Binary_df =cbind(Binary_df,M2$byClass)
Binary_df =cbind(Binary_df,M3$byClass)
colnames(Binary_df) = c("Model1","Model2","Model3")

knitr::kable(Binary_df, format = "html", booktabs = T) 
#linear
Linear_df = data.frame(modelLM1$results)
Linear_df =rbind(Linear_df,modelLM2$results)
row.names(Linear_df) = c("Model1","Model2")

knitr::kable(Linear_df, format = "html", booktabs = T) 
intercept RMSE Rsquared MAE RMSESD RsquaredSD MAESD
Model1 TRUE 4582.520 0.0446288 2006.638 488.3737 0.0032574 94.59403
Model2 TRUE 4583.656 0.0441974 2013.002 487.1691 0.0030228 90.60408

Predictions

Test_df

I applied all of the transformations from the training data frame to the test data frame and then predicted on the test data.
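
One caveat: the code below imputes with the test set's own medians. A stricter alternative, sketched here but not what was run, reuses the training medians so the evaluation data is prepared only with statistics learned from training:

# Hypothetical alternative: impute test AGE/YOJ from the *training* medians
test_df$AGE[is.na(test_df$AGE)] = median(train_df$AGE, na.rm = TRUE)
test_df$YOJ[is.na(test_df$YOJ)] = median(train_df$YOJ, na.rm = TRUE)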

"KIDSDRIV will be converted into a dummy variable 1 has kids and 0 no driving kids"
## [1] "KIDSDRIV will be converted into a dummy variable 1 has kids and 0 no driving kids"
test_df$KIDSDRIV_Dummy = ifelse(test_df$KIDSDRIV==0,0,1)

#AGE appears to follow a normal distribution, so impute missing values with the median
test_df$AGE[is.na(test_df$AGE)]= median(test_df$AGE,na.rm = TRUE)

#Created age groups based on IIHS buckets
test_df$AGE_CAT = cut(test_df$AGE,
                      breaks = c(-Inf, 19, 24, 29, 34, 54, 59, 64, 69, 74, 79, Inf),
                      labels = c("16-19","20-24","25-29","30-34","35-54","55-59",
                                 "60-64","65-69","70-74","75-79","80+"),
                      ordered_result = TRUE)

"convert variable kids at home into a dummy variable"
## [1] "convert variable kids at home into a dummy variable"
test_df$HOMEKIDS_Dummy = ifelse(test_df$HOMEKIDS==0,0,1)


test_df$YOJ[is.na(test_df$YOJ)]= median(test_df$YOJ,na.rm = TRUE)
#train_df$YOJ[train_df$YOJ==0]= 0


"cleans simbols from numeric variables"
## [1] "cleans simbols from numeric variables"
test_df$INCOME = as.numeric(gsub('\\$|,', '', test_df$INCOME))
test_df$INCOME[is.na(test_df$INCOME)]= 0

test_df$PARENT1 = factor(test_df$PARENT1)

test_df$HOME_VAL = as.numeric(gsub('\\$|,', '', test_df$HOME_VAL))

test_df$HOME_VAL[is.na(test_df$HOME_VAL)]= 0

test_df$HASHOME_Dummy = ifelse(test_df$HOME_VAL==0,0,1)

test_df$MSTATUS = factor(test_df$MSTATUS)

test_df$SEX = factor(test_df$SEX)

test_df$Masters_Dummy =ifelse(test_df$EDUCATION %in% c("Masters"), 1,0)

test_df$PHD_Dummy =ifelse(test_df$EDUCATION %in% c("PhD"), 1,0)

test_df$JOB[test_df$JOB==""]= "UNKNOWN"

test_df$JOB = factor(test_df$JOB)

test_df$CAR_USE = factor(test_df$CAR_USE)

test_df$BLUEBOOK = as.numeric(gsub('\\$|,', '', test_df$BLUEBOOK))

test_df$CAR_TYPE = factor(test_df$CAR_TYPE)

test_df$RED_CAR = factor(test_df$RED_CAR)

test_df$OLDCLAIM = as.numeric(gsub('\\$|,', '', test_df$OLDCLAIM))

test_df$CLM_FREQ_Dummy = ifelse(test_df$CLM_FREQ==0,0,1)

test_df$REVOKED = factor(test_df$REVOKED)

test_df$URBANICITY = factor(test_df$URBANICITY)

test_df$CAR_AGE[is.na(test_df$CAR_AGE)]= 0
test_df$CAR_AGE[test_df$CAR_AGE==-3]= 3


test_df = test_df %>% select(-KIDSDRIV,-HOMEKIDS,-CLM_FREQ,-HOME_VAL,-INDEX)

#test_df$TARGET_FLAG = factor(test_df$TARGET_FLAG)
#test_df$TARGET_AMT = NULL
#test_df$TARGET_FLAG = NULL

test_df$TARGET_AMT=as.numeric(test_df$TARGET_AMT)
test_df$TARGET_FLAG = as.factor(test_df$TARGET_FLAG)

Final predictions

# Predict the claim amount first: modelSvm was trained with TARGET_AMT among
# its predictors, so it needs a (predicted) value for it at prediction time
test_df$TARGET_AMT = predict(modelLM1,test_df)

test_df$TARGET_FLAG = predict(modelSvm,test_df)

# Zero out the predicted amount for customers classified as no-crash
test_df$TARGET_AMT = ifelse(test_df$TARGET_FLAG==0,0, test_df$TARGET_AMT)



knitr::kable(head(test_df[1:2],10), format = "html", booktabs = T) 
TARGET_FLAG TARGET_AMT
0 0.000
1 2457.516
0 0.000
1 2294.668
0 0.000
1 2189.601
0 0.000
1 2234.380
0 0.000
0 0.000
write.csv(test_df[1:2],"predictions.csv")
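
A small optional refinement keeps R's automatic row-index column out of the submitted file:

# Same export without the row-name column
write.csv(test_df[1:2], "predictions.csv", row.names = FALSE)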

Data & Code