library("psych")
library("caret")
library("pROC")
library("dplyr")
library("car")
library("kableExtra")
library("mlbench")
#devtools::session_info()
loc_train = "https://raw.githubusercontent.com/chrisestevez/DataAnalytics/master/Data/Hw4/insurance_training_data.csv"
loc_test = "https://raw.githubusercontent.com/chrisestevez/DataAnalytics/master/Data/Hw4/insurance-evaluation-data.csv"
train_df = read.csv(loc_train, stringsAsFactors = FALSE)
test_df = read.csv(loc_test, stringsAsFactors = FALSE)
# Hand-rolled ROC curve with AUC computed by the trapezoidal rule
MY_ROC = function(labels, scores, pname){
  # order the 0/1 labels by decreasing score so cumulative sums trace the curve
  labels = labels[order(scores, decreasing = TRUE)]
  result = data.frame(TPR = cumsum(labels)/sum(labels),
                      FPR = cumsum(!labels)/sum(!labels), labels)
  dFPR = c(diff(result$FPR), 0)
  dTPR = c(diff(result$TPR), 0)
  AUC = round(sum(result$TPR * dFPR) + sum(dTPR * dFPR)/2, 4)
  plot(result$FPR, result$TPR, type = "l", main = paste0(pname, " ROC Curve"),
       ylab = "Sensitivity", xlab = "1-Specificity")
  abline(a = 0, b = 1)  # chance diagonal
  legend(.6, .2, AUC, title = "AUC")
}
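As a quick sanity check of MY_ROC (a sketch added here, not part of the original assignment), the toy data below uses informative-but-noisy scores, so the plotted AUC should land well above 0.5.
set.seed(1)
toy_labels = rbinom(500, 1, 0.3)      # hypothetical toy labels
toy_scores = toy_labels + rnorm(500)  # positives tend to score higher
MY_ROC(toy_labels, toy_scores, "Toy")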
#?describe
#describe(train_df,na.rm = TRUE)
In this homework assignment, you will explore, analyze, and model a data set containing approximately 8000 records, each representing a customer of an auto insurance company. Each record has two response variables. The first, TARGET_FLAG, is a 1 or a 0: a 1 means the person was in a car crash, a 0 means they were not. The second response variable is TARGET_AMT. This value is zero if the person did not crash their car; if they did, it is a value greater than zero.
Your objective is to build multiple linear regression and binary logistic regression models on the training data to predict the probability that a person will crash their car and also the amount of money it will cost if the person does crash their car. You can only use the variables given to you (or variables that you derive from the variables provided). Below is a short description of the variables of interest in the data set:
A write-up submitted in PDF format. Your write-up should have four sections, each described below. You may assume you are addressing me as a fellow data scientist, so you do not need to shy away from technical details.
Assigned predictions (probabilities, classifications, cost) for the evaluation data set, using a 0.5 threshold.
Include your R statistical programming code in an Appendix.
For data preparation, I started by visiting the Insurance Institute for Highway Safety (IIHS). Their website highlights age groups in which accidents decrease, so I created an age factor grouped by the IIHS buckets. I also created various dummy variables, such as KIDSDRIV_Dummy, coded 1 if true and 0 if false; the same was done for home ownership, holding a master's degree, and holding a Ph.D. Missing values in specific variables were replaced with medians, currency columns were cleaned into numerics, and other variables were converted to factors. Due to high collinearity, certain variables were removed from the data set.
"KIDSDRIV will be converted into a dummy variable 1 has kids and 0 no driving kids"
## [1] "KIDSDRIV will be converted into a dummy variable 1 has kids and 0 no driving kids"
train_df$KIDSDRIV_Dummy = ifelse(train_df$KIDSDRIV==0,0,1)
#The age variable seems to follow a normal distribution
train_df$AGE[is.na(train_df$AGE)]= median(train_df$AGE,na.rm = TRUE)
#Created age groups based on IIHS buckets (right-closed bins, ordered factor)
train_df$AGE_CAT = cut(train_df$AGE,
                       breaks = c(-Inf, 19, 24, 29, 34, 54, 59, 64, 69, 74, 79, Inf),
                       labels = c("16-19","20-24","25-29","30-34","35-54","55-59",
                                  "60-64","65-69","70-74","75-79","80+"),
                       ordered_result = TRUE)
"convert variable kids at home into a dummy variable"
## [1] "convert variable kids at home into a dummy variable"
train_df$HOMEKIDS_Dummy = ifelse(train_df$HOMEKIDS==0,0,1)
train_df$YOJ[is.na(train_df$YOJ)]= median(train_df$YOJ,na.rm = TRUE)
"cleans simbols from numeric variables"
## [1] "cleans simbols from numeric variables"
train_df$INCOME = as.numeric(gsub('\\$|,', '', train_df$INCOME))
train_df$INCOME[is.na(train_df$INCOME)]= 0
train_df$PARENT1 = factor(train_df$PARENT1)
train_df$HOME_VAL = as.numeric(gsub('\\$|,', '', train_df$HOME_VAL))
train_df$HOME_VAL[is.na(train_df$HOME_VAL)]= 0
train_df$HASHOME_Dummy = ifelse(train_df$HOME_VAL==0,0,1)
train_df$MSTATUS = factor(train_df$MSTATUS)
train_df$SEX = factor(train_df$SEX)
train_df$Masters_Dummy =ifelse(train_df$EDUCATION %in% c("Masters"), 1,0)
train_df$PHD_Dummy =ifelse(train_df$EDUCATION %in% c("PhD"), 1,0)
train_df$JOB[train_df$JOB==""]= "UNKNOWN"
train_df$JOB = factor(train_df$JOB)
train_df$CAR_USE = factor(train_df$CAR_USE)
train_df$BLUEBOOK = as.numeric(gsub('\\$|,', '', train_df$BLUEBOOK))
train_df$CAR_TYPE = factor(train_df$CAR_TYPE)
train_df$RED_CAR = factor(train_df$RED_CAR)
train_df$OLDCLAIM = as.numeric(gsub('\\$|,', '', train_df$OLDCLAIM))
train_df$CLM_FREQ_Dummy = ifelse(train_df$CLM_FREQ==0,0,1)
train_df$REVOKED = factor(train_df$REVOKED)
train_df$URBANICITY = factor(train_df$URBANICITY)
# Impute missing CAR_AGE with 0 and fix the single negative value (assumed sign-entry error)
train_df$CAR_AGE[is.na(train_df$CAR_AGE)] = 0
train_df$CAR_AGE[train_df$CAR_AGE == -3] = 3
train_df = train_df %>% select(-KIDSDRIV,-HOMEKIDS,-CLM_FREQ,-HOME_VAL,-INDEX)
train_df$TARGET_FLAG = factor(train_df$TARGET_FLAG)
summary(train_df)
## TARGET_FLAG TARGET_AMT AGE YOJ
## 0:6008 Min. : 0 Min. :16.00 Min. : 0.00
## 1:2153 1st Qu.: 0 1st Qu.:39.00 1st Qu.: 9.00
## Median : 0 Median :45.00 Median :11.00
## Mean : 1504 Mean :44.79 Mean :10.53
## 3rd Qu.: 1036 3rd Qu.:51.00 3rd Qu.:13.00
## Max. :107586 Max. :81.00 Max. :23.00
##
## INCOME PARENT1 MSTATUS SEX EDUCATION
## Min. : 0 No :7084 Yes :4894 M :3786 Length:8161
## 1st Qu.: 23157 Yes:1077 z_No:3267 z_F:4375 Class :character
## Median : 51116 Mode :character
## Mean : 58523
## 3rd Qu.: 83304
## Max. :367030
##
## JOB TRAVTIME CAR_USE BLUEBOOK
## z_Blue Collar:1825 Min. : 5.00 Commercial:3029 Min. : 1500
## Clerical :1271 1st Qu.: 22.00 Private :5132 1st Qu.: 9280
## Professional :1117 Median : 33.00 Median :14440
## Manager : 988 Mean : 33.49 Mean :15710
## Lawyer : 835 3rd Qu.: 44.00 3rd Qu.:20850
## Student : 712 Max. :142.00 Max. :69740
## (Other) :1413
## TIF CAR_TYPE RED_CAR OLDCLAIM REVOKED
## Min. : 1.000 Minivan :2145 no :5783 Min. : 0 No :7161
## 1st Qu.: 1.000 Panel Truck: 676 yes:2378 1st Qu.: 0 Yes:1000
## Median : 4.000 Pickup :1389 Median : 0
## Mean : 5.351 Sports Car : 907 Mean : 4037
## 3rd Qu.: 7.000 Van : 750 3rd Qu.: 4636
## Max. :25.000 z_SUV :2294 Max. :57037
##
## MVR_PTS CAR_AGE URBANICITY
## Min. : 0.000 Min. : 0.000 Highly Urban/ Urban :6492
## 1st Qu.: 0.000 1st Qu.: 1.000 z_Highly Rural/ Rural:1669
## Median : 1.000 Median : 8.000
## Mean : 1.696 Mean : 7.809
## 3rd Qu.: 3.000 3rd Qu.:12.000
## Max. :13.000 Max. :28.000
##
## KIDSDRIV_Dummy AGE_CAT HOMEKIDS_Dummy HASHOME_Dummy
## Min. :0.0000 35-54 :6121 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 55-59 : 725 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.0000 30-34 : 649 Median :0.0000 Median :1.0000
## Mean :0.1202 60-64 : 265 Mean :0.3519 Mean :0.6621
## 3rd Qu.:0.0000 25-29 : 249 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :1.0000 65-69 : 65 Max. :1.0000 Max. :1.0000
## (Other): 87
## Masters_Dummy PHD_Dummy CLM_FREQ_Dummy
## Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.0000 Median :0.0000 Median :0.0000
## Mean :0.2032 Mean :0.0892 Mean :0.3862
## 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000
##
knitr::kable(round(describe(train_df,na.rm = TRUE),2), format = "html", booktabs = T) %>% kable_styling(latex_options = c("striped", "scale_down"))
 | vars | n | mean | sd | median | trimmed | mad | min | max | range | skew | kurtosis | se |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
TARGET_FLAG* | 1 | 8161 | 1.26 | 0.44 | 1 | 1.20 | 0.00 | 1 | 2.0 | 1.0 | 1.07 | -0.85 | 0.00 |
TARGET_AMT | 2 | 8161 | 1504.32 | 4704.03 | 0 | 593.71 | 0.00 | 0 | 107586.1 | 107586.1 | 8.71 | 112.29 | 52.07 |
AGE | 3 | 8161 | 44.79 | 8.62 | 45 | 44.83 | 8.90 | 16 | 81.0 | 65.0 | -0.03 | -0.06 | 0.10 |
YOJ | 4 | 8161 | 10.53 | 3.98 | 11 | 11.08 | 2.97 | 0 | 23.0 | 23.0 | -1.26 | 1.45 | 0.04 |
INCOME | 5 | 8161 | 58522.94 | 48345.51 | 51116 | 52958.35 | 44211.13 | 0 | 367030.0 | 367030.0 | 1.16 | 1.99 | 535.16 |
PARENT1* | 6 | 8161 | 1.13 | 0.34 | 1 | 1.04 | 0.00 | 1 | 2.0 | 1.0 | 2.17 | 2.73 | 0.00 |
MSTATUS* | 7 | 8161 | 1.40 | 0.49 | 1 | 1.38 | 0.00 | 1 | 2.0 | 1.0 | 0.41 | -1.83 | 0.01 |
SEX* | 8 | 8161 | 1.54 | 0.50 | 2 | 1.55 | 0.00 | 1 | 2.0 | 1.0 | -0.14 | -1.98 | 0.01 |
EDUCATION* | 9 | 8161 | NaN | NA | NA | NaN | NA | Inf | -Inf | -Inf | NA | NA | NA |
JOB* | 10 | 8161 | 5.43 | 2.76 | 6 | 5.53 | 2.97 | 1 | 9.0 | 8.0 | -0.21 | -1.17 | 0.03 |
TRAVTIME | 11 | 8161 | 33.49 | 15.91 | 33 | 33.00 | 16.31 | 5 | 142.0 | 137.0 | 0.45 | 0.66 | 0.18 |
CAR_USE* | 12 | 8161 | 1.63 | 0.48 | 2 | 1.66 | 0.00 | 1 | 2.0 | 1.0 | -0.53 | -1.72 | 0.01 |
BLUEBOOK | 13 | 8161 | 15709.90 | 8419.73 | 14440 | 15036.89 | 8450.82 | 1500 | 69740.0 | 68240.0 | 0.79 | 0.79 | 93.20 |
TIF | 14 | 8161 | 5.35 | 4.15 | 4 | 4.84 | 4.45 | 1 | 25.0 | 24.0 | 0.89 | 0.42 | 0.05 |
CAR_TYPE* | 15 | 8161 | 3.53 | 1.97 | 3 | 3.54 | 2.97 | 1 | 6.0 | 5.0 | 0.00 | -1.52 | 0.02 |
RED_CAR* | 16 | 8161 | 1.29 | 0.45 | 1 | 1.24 | 0.00 | 1 | 2.0 | 1.0 | 0.92 | -1.16 | 0.01 |
OLDCLAIM | 17 | 8161 | 4037.08 | 8777.14 | 0 | 1719.29 | 0.00 | 0 | 57037.0 | 57037.0 | 3.12 | 9.86 | 97.16 |
REVOKED* | 18 | 8161 | 1.12 | 0.33 | 1 | 1.03 | 0.00 | 1 | 2.0 | 1.0 | 2.30 | 3.30 | 0.00 |
MVR_PTS | 19 | 8161 | 1.70 | 2.15 | 1 | 1.31 | 1.48 | 0 | 13.0 | 13.0 | 1.35 | 1.38 | 0.02 |
CAR_AGE | 20 | 8161 | 7.81 | 5.88 | 8 | 7.41 | 7.41 | 0 | 28.0 | 28.0 | 0.33 | -0.81 | 0.07 |
URBANICITY* | 21 | 8161 | 1.20 | 0.40 | 1 | 1.13 | 0.00 | 1 | 2.0 | 1.0 | 1.46 | 0.15 | 0.00 |
KIDSDRIV_Dummy | 22 | 8161 | 0.12 | 0.33 | 0 | 0.03 | 0.00 | 0 | 1.0 | 1.0 | 2.34 | 3.45 | 0.00 |
AGE_CAT* | 23 | 8161 | 5.02 | 0.79 | 5 | 5.02 | 0.00 | 1 | 11.0 | 10.0 | 0.14 | 6.41 | 0.01 |
HOMEKIDS_Dummy | 24 | 8161 | 0.35 | 0.48 | 0 | 0.31 | 0.00 | 0 | 1.0 | 1.0 | 0.62 | -1.62 | 0.01 |
HASHOME_Dummy | 25 | 8161 | 0.66 | 0.47 | 1 | 0.70 | 0.00 | 0 | 1.0 | 1.0 | -0.69 | -1.53 | 0.01 |
Masters_Dummy | 26 | 8161 | 0.20 | 0.40 | 0 | 0.13 | 0.00 | 0 | 1.0 | 1.0 | 1.48 | 0.18 | 0.00 |
PHD_Dummy | 27 | 8161 | 0.09 | 0.29 | 0 | 0.00 | 0.00 | 0 | 1.0 | 1.0 | 2.88 | 6.31 | 0.00 |
CLM_FREQ_Dummy | 28 | 8161 | 0.39 | 0.49 | 0 | 0.36 | 0.00 | 0 | 1.0 | 1.0 | 0.47 | -1.78 | 0.01 |
There appears to be fairly high correlation between age and kids at home, old claims and claim frequency, MVR points and claim frequency, and car age and the master's-degree dummy.
library("corrplot")
cor_mx = cor(dplyr::select_if(train_df, is.numeric) ,use="pairwise.complete.obs", method = "pearson")
corrplot(cor_mx, method = "color",
         type = "upper", order = "original", number.cex = .7,
         addCoef.col = "black",          # print correlation coefficients
         tl.col = "black", tl.srt = 90,  # text label color and rotation
         diag = TRUE)                    # keep the principal diagonal visible
library("PerformanceAnalytics")
Numericcols = as.data.frame(select_if(train_df, is.numeric))
chart.Correlation(Numericcols[,2:10])
chart.Correlation(Numericcols[,11:ncol(Numericcols)])
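As a programmatic cross-check on the visual inspection above, caret's findCorrelation() flags columns whose pairwise correlations exceed a cutoff; this is a sketch, and the 0.75 cutoff is my assumption rather than part of the original analysis.
high_cor = caret::findCorrelation(cor_mx, cutoff = 0.75, names = TRUE)
high_cor  # candidate columns to drop before modeling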
My first model is a radial-kernel SVM (svmRadial). Evaluated on the training data, the model's accuracy is 97.3% and its kappa is 0.93.
control = trainControl(method="repeatedcv", number=3, repeats=1)
set.seed(7)
modelSvm = train(TARGET_FLAG~., data=train_df, method="svmRadial", trControl=control)
train_df$mypredictedSVM = predict(modelSvm,train_df)
M1 =confusionMatrix(factor(train_df$mypredictedSVM),factor(train_df$TARGET_FLAG),dnn = c("Prediction", "Reference"))
M1
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 6008 220
## 1 0 1933
##
## Accuracy : 0.973
## 95% CI : (0.9693, 0.9764)
## No Information Rate : 0.7362
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9282
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 1.0000
## Specificity : 0.8978
## Pos Pred Value : 0.9647
## Neg Pred Value : 1.0000
## Prevalence : 0.7362
## Detection Rate : 0.7362
## Detection Prevalence : 0.7631
## Balanced Accuracy : 0.9489
##
## 'Positive' Class : 0
##
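The confusion matrix above is computed on the training data itself, which flatters the model. As a less optimistic check (a sketch using caret's stored resampling results), the held-out fold estimates from the repeated CV are available directly:
modelSvm$results        # accuracy/kappa per tuning value, estimated by cross-validation
getTrainPerf(modelSvm)  # resampled performance of the selected tuning parameters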
My second model is the same radial SVM with a Box-Cox transformation applied during preprocessing. The transformation made no difference: training accuracy is again 97.3% and kappa is 0.93.
control = trainControl(method="repeatedcv", number=3, repeats=1)
set.seed(7)
modelSvmBoxCox = train(TARGET_FLAG~., data=train_df, method="svmRadial", trControl=control,preProcess = "BoxCox")
train_df$modelSvmBoxCox = predict(modelSvmBoxCox,train_df)
M2= confusionMatrix(factor(train_df$modelSvmBoxCox),factor(train_df$TARGET_FLAG),dnn = c("Prediction", "Reference"))
M2
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 6008 220
## 1 0 1933
##
## Accuracy : 0.973
## 95% CI : (0.9693, 0.9764)
## No Information Rate : 0.7362
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9282
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 1.0000
## Specificity : 0.8978
## Pos Pred Value : 0.9647
## Neg Pred Value : 1.0000
## Prevalence : 0.7362
## Detection Rate : 0.7362
## Detection Prevalence : 0.7631
## Balanced Accuracy : 0.9489
##
## 'Positive' Class : 0
##
summary(modelSvmBoxCox)
## Length Class Mode
## 1 ksvm S4
The GBM model is attractive at first glance due to its 100% accuracy and AUC of 1, but I am not entirely confident the results are meaningful: the metrics are computed on the training data, and the predictor set still contains TARGET_AMT (nonzero exactly when TARGET_FLAG is 1) as well as the stored SVM prediction columns, so a perfect fit is exactly what leakage would produce. This model was selected over the GLM for its ability to handle the factor variables, and I did not try tweaking it given the already high kappa value.
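One concrete way to see why a perfect fit is achievable here (a diagnostic sketch added for illustration): cross-tabulating the class against a nonzero claim amount shows perfect agreement, so any model that can see TARGET_AMT can separate the classes exactly.
table(train_df$TARGET_FLAG, train_df$TARGET_AMT > 0)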
objControl = trainControl(method='repeatedcv', number=3, returnResamp='none', summaryFunction = twoClassSummary, classProbs = TRUE)
train_df$TARGET_FLAG =ifelse(train_df$TARGET_FLAG==1,'ACDNT','noACDNT')
train_df$TARGET_FLAG = as.factor(train_df$TARGET_FLAG)
set.seed(7)
modelgbm = train(TARGET_FLAG~., train_df,
method='gbm',
trControl=objControl,
metric = "ROC",
preProc = c("center", "scale"))
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 0.9642 nan 0.1000 0.0959
## 2 0.8249 nan 0.1000 0.0709
## 3 0.7150 nan 0.1000 0.0556
## 4 0.6250 nan 0.1000 0.0447
## 5 0.5497 nan 0.1000 0.0377
## 6 0.4857 nan 0.1000 0.0319
## 7 0.4307 nan 0.1000 0.0280
## 8 0.3829 nan 0.1000 0.0237
## 9 0.3413 nan 0.1000 0.0208
## 10 0.3048 nan 0.1000 0.0181
## 20 0.1045 nan 0.1000 0.0058
## 40 0.0137 nan 0.1000 0.0007
## 60 0.0019 nan 0.1000 0.0001
## 80 0.0003 nan 0.1000 0.0000
## 100 0.0000 nan 0.1000 0.0000
## 120 0.0000 nan 0.1000 0.0000
## 140 0.0000 nan 0.1000 -0.0000
## 150 0.0000 nan 0.1000 0.0000
## (near-identical gbm training traces for the remaining cross-validation folds and tuning settings omitted)
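To see what actually drives the perfect fit (a diagnostic sketch, not in the original run), caret's variable importance for the fitted gbm can be inspected; if the leakage suspected above is real, TARGET_AMT and the stored SVM prediction columns should dominate.
varImp(modelgbm)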
train_df$mypredictedGBM = predict(modelgbm,train_df, type='raw')
trainPROBS = predict(modelgbm,train_df, type='prob')
M3=confusionMatrix(factor(train_df$mypredictedGBM),factor(train_df$TARGET_FLAG),dnn = c("Prediction", "Reference"))
M3
## Confusion Matrix and Statistics
##
## Reference
## Prediction ACDNT noACDNT
## ACDNT 2153 0
## noACDNT 0 6008
##
## Accuracy : 1
## 95% CI : (0.9995, 1)
## No Information Rate : 0.7362
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 1
## Mcnemar's Test P-Value : NA
##
## Sensitivity : 1.0000
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 1.0000
## Prevalence : 0.2638
## Detection Rate : 0.2638
## Detection Prevalence : 0.2638
## Balanced Accuracy : 1.0000
##
## 'Positive' Class : ACDNT
##
MY_ROC(as.numeric(ifelse(train_df$TARGET_FLAG=="ACDNT",1,0)),trainPROBS$ACDNT,"Model3")
roc(train_df$TARGET_FLAG,trainPROBS$ACDNT)
##
## Call:
## roc.default(response = train_df$TARGET_FLAG, predictor = trainPROBS$ACDNT)
##
## Data: trainPROBS$ACDNT in 2153 controls (train_df$TARGET_FLAG ACDNT) > 6008 cases (train_df$TARGET_FLAG noACDNT).
## Area under the curve: 1
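As an extra cross-check on MY_ROC (a hedged add-on, not part of the original analysis), pROC can plot the same curve and attach a confidence interval to the AUC:
roc_obj = roc(train_df$TARGET_FLAG, trainPROBS$ACDNT)
plot(roc_obj, main = "pROC ROC Curve, Model 3")
ci.auc(roc_obj)  # with an AUC of exactly 1, the interval should be degenerate at 1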
For the first linear model, I subsetted the data frame. Many of the dummy variables are statistically significant. Variables that significantly increase claim cost are travel time, a revoked license, driving a sports car, and claim frequency. I also tried scaling the variables, but it made little difference. The model was checked for high collinearity and showed none; the adjusted R-squared is .045 and the RMSE is 4582.52.
control = trainControl(method="repeatedcv", number=3, repeats=1)
#train_df$TARGET_FLAG = ifelse(train_df$TARGET_FLAG=="ACDNT",1,0)
LM_df =train_df %>% select(TARGET_AMT,AGE,YOJ,INCOME, TRAVTIME,BLUEBOOK,OLDCLAIM,MVR_PTS,CAR_AGE,KIDSDRIV_Dummy,HOMEKIDS_Dummy,HASHOME_Dummy,Masters_Dummy,PHD_Dummy,CLM_FREQ_Dummy,SEX,CAR_USE,CAR_TYPE,RED_CAR,REVOKED)
set.seed(7)
modelLM1 = train(TARGET_AMT~., data=LM_df %>% select(-AGE,-YOJ,-RED_CAR,-PHD_Dummy,-Masters_Dummy,-BLUEBOOK,-SEX) , method="lm", trControl=control)
summary(modelLM1)
##
## Call:
## lm(formula = .outcome ~ ., data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5235 -1645 -862 78 104074
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.226e+03 2.203e+02 5.568 2.67e-08 ***
## INCOME -2.542e-03 1.176e-03 -2.162 0.030678 *
## TRAVTIME 6.690e+00 3.206e+00 2.087 0.036941 *
## OLDCLAIM -2.151e-02 8.044e-03 -2.674 0.007512 **
## MVR_PTS 1.869e+02 2.690e+01 6.948 4.00e-12 ***
## CAR_AGE -2.284e+01 9.349e+00 -2.443 0.014590 *
## KIDSDRIV_Dummy 4.987e+02 1.787e+02 2.791 0.005271 **
## HOMEKIDS_Dummy 3.121e+02 1.239e+02 2.520 0.011753 *
## HASHOME_Dummy -4.858e+02 1.087e+02 -4.469 7.97e-06 ***
## CLM_FREQ_Dummy 8.849e+02 1.445e+02 6.126 9.44e-10 ***
## CAR_USEPrivate -7.559e+02 1.264e+02 -5.980 2.32e-09 ***
## `CAR_TYPEPanel Truck` 4.632e+02 2.296e+02 2.018 0.043658 *
## CAR_TYPEPickup 3.084e+02 1.666e+02 1.852 0.064106 .
## `CAR_TYPESports Car` 6.597e+02 1.835e+02 3.596 0.000325 ***
## CAR_TYPEVan 5.811e+02 2.023e+02 2.873 0.004076 **
## CAR_TYPEz_SUV 4.064e+02 1.394e+02 2.915 0.003562 **
## REVOKEDYes 8.539e+02 1.761e+02 4.849 1.27e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4596 on 8144 degrees of freedom
## Multiple R-squared: 0.04718, Adjusted R-squared: 0.04531
## F-statistic: 25.21 on 16 and 8144 DF, p-value: < 2.2e-16
car::vif(modelLM1$finalModel)
## INCOME TRAVTIME OLDCLAIM
## 1.248982 1.004751 1.925567
## MVR_PTS CAR_AGE KIDSDRIV_Dummy
## 1.288589 1.165372 1.304497
## HOMEKIDS_Dummy HASHOME_Dummy CLM_FREQ_Dummy
## 1.351526 1.021534 1.911021
## CAR_USEPrivate `CAR_TYPEPanel Truck` CAR_TYPEPickup
## 1.440733 1.546657 1.513736
## `CAR_TYPESports Car` CAR_TYPEVan CAR_TYPEz_SUV
## 1.284338 1.318866 1.517278
## REVOKEDYes
## 1.288196
modelLM1$results
## intercept RMSE Rsquared MAE RMSESD RsquaredSD MAESD
## 1 TRUE 4582.52 0.04462884 2006.638 488.3737 0.00325737 94.59403
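Beyond the VIFs, the usual lm residual diagnostics can be pulled from the caret wrapper (a sketch; given the R-squared of .045 and the heavy right tail in TARGET_AMT, clearly non-normal residuals are to be expected):
par(mfrow = c(2, 2))
plot(modelLM1$finalModel)  # residuals vs fitted, Q-Q, scale-location, leverage
par(mfrow = c(1, 1))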
For the second model, I used all of the variables. They were centered and scaled; the results were similar to the first model, with an adjusted R-squared of .045 and an RMSE of 4583.66.
control = trainControl(method="repeatedcv", number=3, repeats=1)
set.seed(7)
modelLM2 = train(TARGET_AMT~., data=LM_df , method="lm", trControl=control,preProcess = c("center", "scale"))
#c("center", "scale")
modelLM2$results
## intercept RMSE Rsquared MAE RMSESD RsquaredSD MAESD
## 1 TRUE 4583.656 0.04419743 2013.002 487.1691 0.003022803 90.60408
summary(modelLM2)
##
## Call:
## lm(formula = .outcome ~ ., data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5196 -1648 -872 87 104110
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1504.3246 50.8786 29.567 < 2e-16 ***
## AGE 0.3269 62.5428 0.005 0.995829
## YOJ -7.2604 55.0647 -0.132 0.895105
## INCOME -165.4941 65.8724 -2.512 0.012012 *
## TRAVTIME 108.8801 51.0544 2.133 0.032985 *
## BLUEBOOK 126.7672 73.0987 1.734 0.082922 .
## OLDCLAIM -189.2289 70.6250 -2.679 0.007391 **
## MVR_PTS 403.0278 57.8076 6.972 3.37e-12 ***
## CAR_AGE -165.6099 62.7880 -2.638 0.008365 **
## KIDSDRIV_Dummy 158.0869 59.2933 2.666 0.007687 **
## HOMEKIDS_Dummy 170.3104 69.4497 2.452 0.014216 *
## HASHOME_Dummy -231.2843 52.4660 -4.408 1.06e-05 ***
## Masters_Dummy 64.5269 64.4588 1.001 0.316829
## PHD_Dummy 46.5770 62.6910 0.743 0.457526
## CLM_FREQ_Dummy 429.2143 70.3952 6.097 1.13e-09 ***
## SEXz_F -182.5243 91.8808 -1.987 0.047007 *
## CAR_USEPrivate -376.3863 62.4016 -6.032 1.69e-09 ***
## `CAR_TYPEPanel Truck` 42.7397 72.8492 0.587 0.557430
## CAR_TYPEPickup 124.0336 63.2721 1.960 0.049993 *
## `CAR_TYPESports Car` 292.2436 69.1708 4.225 2.42e-05 ***
## CAR_TYPEVan 124.8957 60.9474 2.049 0.040471 *
## CAR_TYPEz_SUV 300.7093 81.4739 3.691 0.000225 ***
## RED_CARyes -13.6309 68.4483 -0.199 0.842157
## REVOKEDYes 278.4177 57.7800 4.819 1.47e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4596 on 8137 degrees of freedom
## Multiple R-squared: 0.04798, Adjusted R-squared: 0.04528
## F-statistic: 17.83 on 23 and 8137 DF, p-value: < 2.2e-16
car::vif(modelLM2$finalModel)
## AGE YOJ INCOME
## 1.510886 1.171177 1.676040
## TRAVTIME BLUEBOOK OLDCLAIM
## 1.006800 2.063935 1.926611
## MVR_PTS CAR_AGE KIDSDRIV_Dummy
## 1.290763 1.522754 1.357965
## HOMEKIDS_Dummy HASHOME_Dummy Masters_Dummy
## 1.863020 1.063243 1.604873
## PHD_Dummy CLM_FREQ_Dummy SEXz_F
## 1.518055 1.914091 3.260813
## CAR_USEPrivate `CAR_TYPEPanel Truck` CAR_TYPEPickup
## 1.504071 2.049871 1.546328
## `CAR_TYPESports Car` CAR_TYPEVan CAR_TYPEz_SUV
## 1.848087 1.434785 2.563974
## RED_CARyes REVOKEDYes
## 1.809684 1.289530
For the final models, I chose Model 1 for the binary classification task and Model 1 for the linear regression. I felt more confident using Model 1 for classification; Model 3's fit was simply too perfect to trust. For the linear regression, Model 1 seems like the right choice: it uses fewer variables, is statistically significant, and has a slightly lower RMSE.
Binary_df =data.frame(M1$byClass)
Binary_df =cbind(Binary_df,M2$byClass)
Binary_df =cbind(Binary_df,M3$byClass)
colnames(Binary_df) = c("Model1","Model2","Model3")
knitr::kable(Binary_df, format = "html", booktabs = T)
#linear
Linear_df = data.frame(modelLM1$results)
Linear_df =rbind(Linear_df,modelLM2$results)
row.names(Linear_df) = c("Model1","Model2")
knitr::kable(Linear_df, format = "html", booktabs = T)
 | intercept | RMSE | Rsquared | MAE | RMSESD | RsquaredSD | MAESD |
---|---|---|---|---|---|---|---|
Model1 | TRUE | 4582.520 | 0.0446288 | 2006.638 | 488.3737 | 0.0032574 | 94.59403 |
Model2 | TRUE | 4583.656 | 0.0441974 | 2013.002 | 487.1691 | 0.0030228 | 90.60408 |
I applied all of the transformations from the training data frame to the test data frame and then predicted on the test data; a sketch of wrapping these shared steps in a single helper follows the preparation block below.
"KIDSDRIV will be converted into a dummy variable 1 has kids and 0 no driving kids"
## [1] "KIDSDRIV will be converted into a dummy variable 1 has kids and 0 no driving kids"
test_df$KIDSDRIV_Dummy = ifelse(test_df$KIDSDRIV==0,0,1)
#The age variable seems to follow a normal distribution
test_df$AGE[is.na(test_df$AGE)]= median(test_df$AGE,na.rm = TRUE)
#Created age groups based on IIHS buckets (right-closed bins, ordered factor)
test_df$AGE_CAT = cut(test_df$AGE,
                      breaks = c(-Inf, 19, 24, 29, 34, 54, 59, 64, 69, 74, 79, Inf),
                      labels = c("16-19","20-24","25-29","30-34","35-54","55-59",
                                 "60-64","65-69","70-74","75-79","80+"),
                      ordered_result = TRUE)
"convert variable kids at home into a dummy variable"
## [1] "convert variable kids at home into a dummy variable"
test_df$HOMEKIDS_Dummy = ifelse(test_df$HOMEKIDS==0,0,1)
test_df$YOJ[is.na(test_df$YOJ)]= median(test_df$YOJ,na.rm = TRUE)
"cleans simbols from numeric variables"
## [1] "cleans simbols from numeric variables"
test_df$INCOME = as.numeric(gsub('\\$|,', '', test_df$INCOME))
test_df$INCOME[is.na(test_df$INCOME)]= 0
test_df$PARENT1 = factor(test_df$PARENT1)
test_df$HOME_VAL = as.numeric(gsub('\\$|,', '', test_df$HOME_VAL))
test_df$HOME_VAL[is.na(test_df$HOME_VAL)]= 0
test_df$HASHOME_Dummy = ifelse(test_df$HOME_VAL==0,0,1)
test_df$MSTATUS = factor(test_df$MSTATUS)
test_df$SEX = factor(test_df$SEX)
test_df$Masters_Dummy =ifelse(test_df$EDUCATION %in% c("Masters"), 1,0)
test_df$PHD_Dummy =ifelse(test_df$EDUCATION %in% c("PhD"), 1,0)
test_df$JOB[test_df$JOB==""]= "UNKNOWN"
test_df$JOB = factor(test_df$JOB)
test_df$CAR_USE = factor(test_df$CAR_USE)
test_df$BLUEBOOK = as.numeric(gsub('\\$|,', '', test_df$BLUEBOOK))
test_df$CAR_TYPE = factor(test_df$CAR_TYPE)
test_df$RED_CAR = factor(test_df$RED_CAR)
test_df$OLDCLAIM = as.numeric(gsub('\\$|,', '', test_df$OLDCLAIM))
test_df$CLM_FREQ_Dummy = ifelse(test_df$CLM_FREQ==0,0,1)
test_df$REVOKED = factor(test_df$REVOKED)
test_df$URBANICITY = factor(test_df$URBANICITY)
# Impute missing CAR_AGE with 0 and fix any negative value (assumed sign-entry error)
test_df$CAR_AGE[is.na(test_df$CAR_AGE)] = 0
test_df$CAR_AGE[test_df$CAR_AGE == -3] = 3
test_df = test_df %>% select(-KIDSDRIV,-HOMEKIDS,-CLM_FREQ,-HOME_VAL,-INDEX)
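Since the cleaning steps are duplicated verbatim for the training and evaluation sets, a safer pattern is to wrap them in one helper and call it on both data frames. The sketch below is illustrative only: prep_insurance is a hypothetical name and its body is abbreviated, mirroring the steps in the chunks above.
prep_insurance = function(df) {
  df$KIDSDRIV_Dummy = ifelse(df$KIDSDRIV == 0, 0, 1)
  df$AGE[is.na(df$AGE)] = median(df$AGE, na.rm = TRUE)
  df$INCOME = as.numeric(gsub('\\$|,', '', df$INCOME))
  df$INCOME[is.na(df$INCOME)] = 0
  # ... remaining steps identical to the cleaning chunks above ...
  df
}
# train_df = prep_insurance(train_df); test_df = prep_insurance(test_df)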
test_df$TARGET_AMT=as.numeric(test_df$TARGET_AMT)
test_df$TARGET_FLAG = as.factor(test_df$TARGET_FLAG)
# Predict cost with the linear model and class with the SVM, then zero out the
# predicted cost for customers predicted not to crash
test_df$TARGET_AMT = predict(modelLM1, test_df)
test_df$TARGET_FLAG = predict(modelSvm, test_df)
test_df$TARGET_AMT = ifelse(test_df$TARGET_FLAG == 0, 0, test_df$TARGET_AMT)
knitr::kable(head(test_df[1:2],10), format = "html", booktabs = T)
TARGET_FLAG | TARGET_AMT |
---|---|
0 | 0.000 |
1 | 2457.516 |
0 | 0.000 |
1 | 2294.668 |
0 | 0.000 |
1 | 2189.601 |
0 | 0.000 |
1 | 2234.380 |
0 | 0.000 |
0 | 0.000 |
write.csv(test_df[1:2],"predictions.csv")