Here, we read the training dataset into a dataframe.
df <- read.csv("https://raw.githubusercontent.com/mkivenson/Business-Analytics-Data-Mining/master/Insurance%20Model/insurance_training_data.csv")[-1]  # drop the index column
head(df)
## TARGET_FLAG TARGET_AMT KIDSDRIV AGE HOMEKIDS YOJ INCOME PARENT1 HOME_VAL
## 1 0 0 0 60 0 11 $67,349 No $0
## 2 0 0 0 43 0 11 $91,449 No $257,252
## 3 0 0 0 35 1 10 $16,039 No $124,191
## 4 0 0 0 51 0 14 No $306,251
## 5 0 0 0 50 0 NA $114,986 No $243,925
## 6 1 2946 0 34 1 12 $125,301 Yes $0
## MSTATUS SEX EDUCATION JOB TRAVTIME CAR_USE BLUEBOOK TIF
## 1 z_No M PhD Professional 14 Private $14,230 11
## 2 z_No M z_High School z_Blue Collar 22 Commercial $14,940 1
## 3 Yes z_F z_High School Clerical 5 Private $4,010 4
## 4 Yes M <High School z_Blue Collar 32 Private $15,440 7
## 5 Yes z_F PhD Doctor 36 Private $18,000 1
## 6 z_No z_F Bachelors z_Blue Collar 46 Commercial $17,430 1
## CAR_TYPE RED_CAR OLDCLAIM CLM_FREQ REVOKED MVR_PTS CAR_AGE
## 1 Minivan yes $4,461 2 No 3 18
## 2 Minivan yes $0 0 No 0 1
## 3 z_SUV no $38,690 2 No 3 10
## 4 Minivan yes $0 0 No 0 6
## 5 z_SUV no $19,217 2 Yes 3 17
## 6 Sports Car no $0 0 No 0 7
## URBANICITY
## 1 Highly Urban/ Urban
## 2 Highly Urban/ Urban
## 3 Highly Urban/ Urban
## 4 Highly Urban/ Urban
## 5 Highly Urban/ Urban
## 6 Highly Urban/ Urban
# The currency fields are read in as strings like "$67,349"; strip the "$"
# and "," characters before converting to numeric (a bare as.numeric() call
# would return factor codes or NAs instead of dollar amounts).
df$INCOME <- as.numeric(gsub("[$,]", "", df$INCOME))
df$HOME_VAL <- as.numeric(gsub("[$,]", "", df$HOME_VAL))
df$BLUEBOOK <- as.numeric(gsub("[$,]", "", df$BLUEBOOK))
df$OLDCLAIM <- as.numeric(gsub("[$,]", "", df$OLDCLAIM))
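An equivalent, more compact alternative to the gsub() calls above, assuming the readr package is available:
library(readr)
# parse_number() strips currency symbols and thousands separators automatically;
# run this instead of (not in addition to) the gsub() conversion above.
df$INCOME <- parse_number(as.character(df$INCOME))
df$HOME_VAL <- parse_number(as.character(df$HOME_VAL))
df$BLUEBOOK <- parse_number(as.character(df$BLUEBOOK))
df$OLDCLAIM <- parse_number(as.character(df$OLDCLAIM))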
Next, we take a look at a summary of the data.
summary(df)
## TARGET_FLAG TARGET_AMT KIDSDRIV AGE
## Min. :0.0000 Min. : 0 Min. :0.0000 Min. :16.00
## 1st Qu.:0.0000 1st Qu.: 0 1st Qu.:0.0000 1st Qu.:39.00
## Median :0.0000 Median : 0 Median :0.0000 Median :45.00
## Mean :0.2638 Mean : 1504 Mean :0.1711 Mean :44.79
## 3rd Qu.:1.0000 3rd Qu.: 1036 3rd Qu.:0.0000 3rd Qu.:51.00
## Max. :1.0000 Max. :107586 Max. :4.0000 Max. :81.00
## NA's :6
## HOMEKIDS YOJ INCOME PARENT1 HOME_VAL
## Min. :0.0000 Min. : 0.0 Min. : 1 No :7084 Min. : 1
## 1st Qu.:0.0000 1st Qu.: 9.0 1st Qu.: 926 Yes:1077 1st Qu.: 2
## Median :0.0000 Median :11.0 Median :2817 Median :1245
## Mean :0.7212 Mean :10.5 Mean :2876 Mean :1685
## 3rd Qu.:1.0000 3rd Qu.:13.0 3rd Qu.:4701 3rd Qu.:3164
## Max. :5.0000 Max. :23.0 Max. :6613 Max. :5107
## NA's :454
## MSTATUS SEX EDUCATION JOB
## Yes :4894 M :3786 <High School :1203 z_Blue Collar:1825
## z_No:3267 z_F:4375 Bachelors :2242 Clerical :1271
## Masters :1658 Professional :1117
## PhD : 728 Manager : 988
## z_High School:2330 Lawyer : 835
## Student : 712
## (Other) :1413
## TRAVTIME CAR_USE BLUEBOOK TIF
## Min. : 5.00 Commercial:3029 Min. : 1 Min. : 1.000
## 1st Qu.: 22.00 Private :5132 1st Qu.: 478 1st Qu.: 1.000
## Median : 33.00 Median :1124 Median : 4.000
## Mean : 33.49 Mean :1284 Mean : 5.351
## 3rd Qu.: 44.00 3rd Qu.:2234 3rd Qu.: 7.000
## Max. :142.00 Max. :2789 Max. :25.000
##
## CAR_TYPE RED_CAR OLDCLAIM CLM_FREQ REVOKED
## Minivan :2145 no :5783 Min. : 1.0 Min. :0.0000 No :7161
## Panel Truck: 676 yes:2378 1st Qu.: 1.0 1st Qu.:0.0000 Yes:1000
## Pickup :1389 Median : 1.0 Median :0.0000
## Sports Car : 907 Mean : 552.3 Mean :0.7986
## Van : 750 3rd Qu.:1015.0 3rd Qu.:2.0000
## z_SUV :2294 Max. :2857.0 Max. :5.0000
##
## MVR_PTS CAR_AGE URBANICITY
## Min. : 0.000 Min. :-3.000 Highly Urban/ Urban :6492
## 1st Qu.: 0.000 1st Qu.: 1.000 z_Highly Rural/ Rural:1669
## Median : 1.000 Median : 8.000
## Mean : 1.696 Mean : 8.328
## 3rd Qu.: 3.000 3rd Qu.:12.000
## Max. :13.000 Max. :28.000
## NA's :510
Taking a look at the distributions of the numerical variables, we can see that many of them (for example TARGET_AMT, KIDSDRIV, HOMEKIDS, OLDCLAIM, and MVR_PTS) are heavily right-skewed, with most observations at or near zero; a quick skewness check follows the histograms.
library(ggplot2)    # plotting
library(gridExtra)  # grid.arrange()
grid.arrange(ggplot(df, aes(TARGET_FLAG)) + geom_histogram(binwidth = .5),
ggplot(df, aes(TARGET_AMT)) + geom_histogram(binwidth = 1000),
ggplot(df, aes(KIDSDRIV)) + geom_histogram(binwidth = .1),
ggplot(df, aes(AGE)) + geom_histogram(binwidth = 10),
ggplot(df, aes(HOMEKIDS)) + geom_histogram(binwidth = .5),
ggplot(df, aes(YOJ)) + geom_histogram(binwidth = 1),
ggplot(df, aes(INCOME)) + geom_histogram(binwidth = 500),
ggplot(df, aes(HOME_VAL)) + geom_histogram(binwidth = 500),
ggplot(df, aes(TRAVTIME)) + geom_histogram(binwidth = 10),
ggplot(df, aes(BLUEBOOK)) + geom_histogram(binwidth = 200),
ggplot(df, aes(TIF)) + geom_histogram(binwidth = 5),
ggplot(df, aes(OLDCLAIM)) + geom_histogram(binwidth = 100),
ggplot(df, aes(MVR_PTS)) + geom_histogram(binwidth = 2),
ggplot(df, aes(CAR_AGE)) + geom_histogram(binwidth = 2),
ncol=4)
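To quantify the skew visible in the histograms, one option is the sample skewness from the e1071 package (an assumption; the package is not used elsewhere in this analysis, and any skewness implementation would do):
library(e1071)
# sample skewness of each numeric column, ignoring missing values
sapply(df[, sapply(df, is.numeric)], skewness, na.rm = TRUE)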
For the classification task, it is insightful to compare the distributions of the numerical features across the two levels of TARGET_FLAG. The features that appear to differ the most between the levels are HOME_VAL, OLDCLAIM, and MVR_PTS.
# One boxplot per numeric feature, grouped by TARGET_FLAG
box_vars <- c("KIDSDRIV", "AGE", "HOMEKIDS", "YOJ", "INCOME", "HOME_VAL",
              "TRAVTIME", "BLUEBOOK", "TIF", "OLDCLAIM", "MVR_PTS", "CAR_AGE")
box_plots <- lapply(box_vars, function(v)
  ggplot(df, aes(x = as.factor(TARGET_FLAG), y = .data[[v]],
                 fill = as.factor(TARGET_FLAG))) +
    geom_boxplot() + labs(x = "TARGET_FLAG", y = v) +
    theme(legend.position = "none"))
grid.arrange(grobs = box_plots, ncol = 4)
Looking at a correlation plot of the numeric variables, it is evident that there is some collinearity in the dataset:

- HOMEKIDS and AGE have a negative correlation
- HOMEKIDS and KIDSDRIV have a positive correlation
- CLM_FREQ and OLDCLAIM have a strong negative correlation
- MVR_PTS and OLDCLAIM have a negative correlation
- MVR_PTS and CLM_FREQ have a negative correlation

library(corrplot)  # correlation plot
corrplot(cor(df[, sapply(df, is.numeric)], use = "complete.obs"),
         method = "color", type = "lower", tl.col = "black", tl.srt = 5)
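To back the visual reading with numbers, we can list the pairs of numeric variables whose correlation magnitude exceeds a cutoff (0.3 here is an arbitrary illustrative choice):
cm <- cor(df[, sapply(df, is.numeric)], use = "complete.obs")
cm[upper.tri(cm, diag = TRUE)] <- NA  # keep each pair only once
idx <- which(abs(cm) > 0.3, arr.ind = TRUE)
data.frame(var1 = rownames(cm)[idx[, 1]],
           var2 = colnames(cm)[idx[, 2]],
           r = round(cm[idx], 2))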
Based on the exploratory data analysis above, we must impute missing values, encode categorical variables, and apply feature transformations.
We will use Multivariate Imputation by Chained Equations (mice) to fill in the missing values. First, we inspect the missingness pattern; the imputation itself is sketched after the plot.
library(VIM)  # aggr() missingness plot
aggr(df, col = c('navyblue','red'), numbers = TRUE, sortVars = TRUE,
     labels = names(df), cex.axis = .7, gap = 3,
     ylab = c("Histogram of missing data", "Pattern"))
## Warning in plot.aggr(res, ...): not enough horizontal space to display
## frequencies
##
## Variables sorted by number of missings:
## Variable Count
## CAR_AGE 0.062492342
## YOJ 0.055630437
## AGE 0.000735204
## TARGET_FLAG 0.000000000
## TARGET_AMT 0.000000000
## KIDSDRIV 0.000000000
## HOMEKIDS 0.000000000
## INCOME 0.000000000
## PARENT1 0.000000000
## HOME_VAL 0.000000000
## MSTATUS 0.000000000
## SEX 0.000000000
## EDUCATION 0.000000000
## JOB 0.000000000
## TRAVTIME 0.000000000
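As promised above, the missing values can be filled with mice. A minimal sketch of that step, assuming m = 5 imputations and the package's default per-variable methods (the author's exact settings are not shown):
library(mice)
# chained-equations imputation; seed fixed for reproducibility (assumed settings)
imp <- mice(df, m = 5, seed = 42, printFlag = FALSE)
df <- complete(imp)  # keep the first completed dataset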
The following categorical features have to be encoded: PARENT1, MSTATUS, SEX, EDUCATION, JOB, CAR_USE, CAR_TYPE, RED_CAR, REVOKED, and URBANICITY. To do this, the dummyVars function from caret will be used.
library(caret)  # dummyVars(), preProcess()
dmy <- dummyVars(" ~ .", data = df)
df <- data.frame(predict(dmy, newdata = df))
Taking a look at the new columns in the dataframe, it is clear that some of them are unnecessary. Since each categorical feature needs one fewer dummy column than it has categories (the omitted level serves as the reference), we drop one dummy column per feature.
names(df)
## [1] "TARGET_FLAG" "TARGET_AMT"
## [3] "KIDSDRIV" "AGE"
## [5] "HOMEKIDS" "YOJ"
## [7] "INCOME" "PARENT1.No"
## [9] "PARENT1.Yes" "HOME_VAL"
## [11] "MSTATUS.Yes" "MSTATUS.z_No"
## [13] "SEX.M" "SEX.z_F"
## [15] "EDUCATION..High.School" "EDUCATION.Bachelors"
## [17] "EDUCATION.Masters" "EDUCATION.PhD"
## [19] "EDUCATION.z_High.School" "JOB."
## [21] "JOB.Clerical" "JOB.Doctor"
## [23] "JOB.Home.Maker" "JOB.Lawyer"
## [25] "JOB.Manager" "JOB.Professional"
## [27] "JOB.Student" "JOB.z_Blue.Collar"
## [29] "TRAVTIME" "CAR_USE.Commercial"
## [31] "CAR_USE.Private" "BLUEBOOK"
## [33] "TIF" "CAR_TYPE.Minivan"
## [35] "CAR_TYPE.Panel.Truck" "CAR_TYPE.Pickup"
## [37] "CAR_TYPE.Sports.Car" "CAR_TYPE.Van"
## [39] "CAR_TYPE.z_SUV" "RED_CAR.no"
## [41] "RED_CAR.yes" "OLDCLAIM"
## [43] "CLM_FREQ" "REVOKED.No"
## [45] "REVOKED.Yes" "MVR_PTS"
## [47] "CAR_AGE" "URBANICITY.Highly.Urban..Urban"
## [49] "URBANICITY.z_Highly.Rural..Rural"
drop <- c("PARENT1.No", "MSTATUS.z_No", "SEX.M", "EDUCATION.z_High.School", "JOB.", "CAR_USE.Commercial", "CAR_TYPE.Pickup", "RED_CAR.no", "REVOKED.No", "URBANICITY.z_Highly.Rural..Rural")
df <- df[, !(names(df) %in% drop)]
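Note that caret can produce this reduced encoding directly: passing fullRank = TRUE to dummyVars keeps k - 1 columns for a k-level factor, though the reference levels it drops may differ from the hand-picked ones above.
# alternative to the manual encode-then-drop steps above (run instead of them)
dmy <- dummyVars(" ~ .", data = df, fullRank = TRUE)
df <- data.frame(predict(dmy, newdata = df))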
For the linear regression models, performance will be evaluated using R-squared and RMSE; for the binary logistic regression model, performance will also be measured by accuracy on held-out test data. To accomplish this, we create an 80/20 train/test split. The train and test sets are transformed separately to avoid data leakage: the centering and scaling parameters are estimated on the training set only and then applied to the test set.
library(dplyr)  # %>% pipe
set.seed(42)
# randomly sample 80% of the row indices for the training set
inTrain <- sample(nrow(df), floor(0.8 * nrow(df)))
training <- df[inTrain, -(1:2)]
test <- df[-inTrain, -(1:2)]
train_y <- df[inTrain, (1:2)]
test_y <- df[-inTrain, (1:2)]
preProcValues <- preProcess(training, method = c("center", "scale"))
insurance_tf_train <- predict(preProcValues, training) %>% cbind(train_y)
insurance_tf_test <- predict(preProcValues, test) %>% cbind(test_y)
preProcValues_all <- preProcess(df[, -(1:2)], method = c("center", "scale"))
insurance_tf <- predict(preProcValues_all, df[, -(1:2)]) %>% cbind(df[, (1:2)])
write.csv(insurance_tf, "C:\\Users\\mkive\\Documents\\GitHub\\Business-Analytics-Data-Mining\\Business-Analytics-Data-Mining\\Insurance Model\\insurance_tf.csv")
write.csv(insurance_tf_train, "C:\\Users\\mkive\\Documents\\GitHub\\Business-Analytics-Data-Mining\\Business-Analytics-Data-Mining\\Insurance Model\\insurance_tf_train.csv")
write.csv(insurance_tf_test, "C:\\Users\\mkive\\Documents\\GitHub\\Business-Analytics-Data-Mining\\Business-Analytics-Data-Mining\\Insurance Model\\insurance_tf_test.csv")
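To preview how these metrics will be computed, here is a minimal sketch using simple baseline fits on the transformed data; these are illustrations only, not the models built in later sections:
# Illustrative baseline fits (assumptions, not the author's final models)
fit_lm <- lm(TARGET_AMT ~ . - TARGET_FLAG, data = insurance_tf_train)
pred <- predict(fit_lm, newdata = insurance_tf_test)
rmse <- sqrt(mean((insurance_tf_test$TARGET_AMT - pred)^2))  # RMSE
rsq <- cor(insurance_tf_test$TARGET_AMT, pred)^2             # R-squared

fit_glm <- glm(TARGET_FLAG ~ . - TARGET_AMT, data = insurance_tf_train,
               family = binomial)
prob <- predict(fit_glm, newdata = insurance_tf_test, type = "response")
acc <- mean(as.numeric(prob > 0.5) == insurance_tf_test$TARGET_FLAG)  # accuracy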