In this homework assignment, we will explore, analyze and model a data set containing approximately 8000 records representing a customer at an auto insurance company. Each record has two response variables. The first response variable, TARGET_FLAG, is a 1 or a 0. A “1” means that the person was in a car crash. A zero means that the person was not in a car crash. The second response variable is TARGET_AMT. This value is zero if the person did not crash their car. But if they did crash their car this number will be a value greater than zero.
Our objective is to build multiple linear regression and binary logistic regression models on the training data to predict the probability that a person will crash their car and also the amount of money it will cost if the person does crash their car.
Loading the libraries:
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(stringr)
library(mice)
## Warning: package 'mice' was built under R version 3.6.2
##
## Attaching package: 'mice'
## The following objects are masked from 'package:base':
##
## cbind, rbind
library(VIM)
## Warning: package 'VIM' was built under R version 3.6.2
## Loading required package: colorspace
## Loading required package: grid
## Loading required package: data.table
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
## VIM is ready to use.
## Since version 4.0.0 the GUI is in its own package VIMGUI.
##
## Please use the package to use the new (and old) GUI.
## Suggestions and bug-reports can be submitted at: https://github.com/alexkowa/VIM/issues
##
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
##
## sleep
library(tidyr)
library(pROC)
## Warning: package 'pROC' was built under R version 3.6.2
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following object is masked from 'package:colorspace':
##
## coords
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
Loading the data, and exploring the data:
insurance_raw <- read.csv("https://raw.githubusercontent.com/deepakmongia/Data621/master/HW-4/Data/insurance_training_data.csv",
header = TRUE, row.names = 1)
print(head(insurance_raw))
## TARGET_FLAG TARGET_AMT KIDSDRIV AGE HOMEKIDS YOJ INCOME PARENT1
## 1 0 0 0 60 0 11 $67,349 No
## 2 0 0 0 43 0 11 $91,449 No
## 4 0 0 0 35 1 10 $16,039 No
## 5 0 0 0 51 0 14 No
## 6 0 0 0 50 0 NA $114,986 No
## 7 1 2946 0 34 1 12 $125,301 Yes
## HOME_VAL MSTATUS SEX EDUCATION JOB TRAVTIME CAR_USE
## 1 $0 z_No M PhD Professional 14 Private
## 2 $257,252 z_No M z_High School z_Blue Collar 22 Commercial
## 4 $124,191 Yes z_F z_High School Clerical 5 Private
## 5 $306,251 Yes M <High School z_Blue Collar 32 Private
## 6 $243,925 Yes z_F PhD Doctor 36 Private
## 7 $0 z_No z_F Bachelors z_Blue Collar 46 Commercial
## BLUEBOOK TIF CAR_TYPE RED_CAR OLDCLAIM CLM_FREQ REVOKED MVR_PTS
## 1 $14,230 11 Minivan yes $4,461 2 No 3
## 2 $14,940 1 Minivan yes $0 0 No 0
## 4 $4,010 4 z_SUV no $38,690 2 No 3
## 5 $15,440 7 Minivan yes $0 0 No 0
## 6 $18,000 1 z_SUV no $19,217 2 Yes 3
## 7 $17,430 1 Sports Car no $0 0 No 0
## CAR_AGE URBANICITY
## 1 18 Highly Urban/ Urban
## 2 1 Highly Urban/ Urban
## 4 10 Highly Urban/ Urban
## 5 6 Highly Urban/ Urban
## 6 17 Highly Urban/ Urban
## 7 7 Highly Urban/ Urban
print(dim(insurance_raw))
## [1] 8161 25
print(str(insurance_raw))
## 'data.frame': 8161 obs. of 25 variables:
## $ TARGET_FLAG: int 0 0 0 0 0 1 0 1 1 0 ...
## $ TARGET_AMT : num 0 0 0 0 0 ...
## $ KIDSDRIV : int 0 0 0 0 0 0 0 1 0 0 ...
## $ AGE : int 60 43 35 51 50 34 54 37 34 50 ...
## $ HOMEKIDS : int 0 0 1 0 0 1 0 2 0 0 ...
## $ YOJ : int 11 11 10 14 NA 12 NA NA 10 7 ...
## $ INCOME : Factor w/ 6613 levels "","$0","$1,007",..: 5033 6292 1250 1 509 746 1488 315 4765 282 ...
## $ PARENT1 : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 2 1 1 1 1 ...
## $ HOME_VAL : Factor w/ 5107 levels "","$0","$100,093",..: 2 3259 348 3917 3034 2 1 4167 2 2 ...
## $ MSTATUS : Factor w/ 2 levels "Yes","z_No": 2 2 1 1 1 2 1 1 2 2 ...
## $ SEX : Factor w/ 2 levels "M","z_F": 1 1 2 1 2 2 2 1 2 1 ...
## $ EDUCATION : Factor w/ 5 levels "<High School",..: 4 5 5 1 4 2 1 2 2 2 ...
## $ JOB : Factor w/ 9 levels "","Clerical",..: 7 9 2 9 3 9 9 9 2 7 ...
## $ TRAVTIME : int 14 22 5 32 36 46 33 44 34 48 ...
## $ CAR_USE : Factor w/ 2 levels "Commercial","Private": 2 1 2 2 2 1 2 1 2 1 ...
## $ BLUEBOOK : Factor w/ 2789 levels "$1,500","$1,520",..: 434 503 2212 553 802 746 2672 701 135 852 ...
## $ TIF : int 11 1 4 7 1 1 1 1 1 7 ...
## $ CAR_TYPE : Factor w/ 6 levels "Minivan","Panel Truck",..: 1 1 6 1 6 4 6 5 6 5 ...
## $ RED_CAR : Factor w/ 2 levels "no","yes": 2 2 1 2 1 1 1 2 1 1 ...
## $ OLDCLAIM : Factor w/ 2857 levels "$0","$1,000",..: 1449 1 1311 1 432 1 1 510 1 1 ...
## $ CLM_FREQ : int 2 0 2 0 2 0 0 1 0 0 ...
## $ REVOKED : Factor w/ 2 levels "No","Yes": 1 1 1 1 2 1 1 2 1 1 ...
## $ MVR_PTS : int 3 0 3 0 3 0 0 10 0 1 ...
## $ CAR_AGE : int 18 1 10 6 17 7 1 7 1 17 ...
## $ URBANICITY : Factor w/ 2 levels "Highly Urban/ Urban",..: 1 1 1 1 1 1 1 1 1 2 ...
## NULL
print(summary(insurance_raw))
## TARGET_FLAG TARGET_AMT KIDSDRIV AGE
## Min. :0.0000 Min. : 0 Min. :0.0000 Min. :16.00
## 1st Qu.:0.0000 1st Qu.: 0 1st Qu.:0.0000 1st Qu.:39.00
## Median :0.0000 Median : 0 Median :0.0000 Median :45.00
## Mean :0.2638 Mean : 1504 Mean :0.1711 Mean :44.79
## 3rd Qu.:1.0000 3rd Qu.: 1036 3rd Qu.:0.0000 3rd Qu.:51.00
## Max. :1.0000 Max. :107586 Max. :4.0000 Max. :81.00
## NA's :6
## HOMEKIDS YOJ INCOME PARENT1
## Min. :0.0000 Min. : 0.0 $0 : 615 No :7084
## 1st Qu.:0.0000 1st Qu.: 9.0 : 445 Yes:1077
## Median :0.0000 Median :11.0 $26,840 : 4
## Mean :0.7212 Mean :10.5 $48,509 : 4
## 3rd Qu.:1.0000 3rd Qu.:13.0 $61,790 : 4
## Max. :5.0000 Max. :23.0 $107,375: 3
## NA's :454 (Other) :7086
## HOME_VAL MSTATUS SEX EDUCATION
## $0 :2294 Yes :4894 M :3786 <High School :1203
## : 464 z_No:3267 z_F:4375 Bachelors :2242
## $111,129: 3 Masters :1658
## $115,249: 3 PhD : 728
## $123,109: 3 z_High School:2330
## $153,061: 3
## (Other) :5391
## JOB TRAVTIME CAR_USE BLUEBOOK
## z_Blue Collar:1825 Min. : 5.00 Commercial:3029 $1,500 : 157
## Clerical :1271 1st Qu.: 22.00 Private :5132 $6,000 : 34
## Professional :1117 Median : 33.00 $5,800 : 33
## Manager : 988 Mean : 33.49 $6,200 : 33
## Lawyer : 835 3rd Qu.: 44.00 $6,400 : 31
## Student : 712 Max. :142.00 $5,900 : 30
## (Other) :1413 (Other):7843
## TIF CAR_TYPE RED_CAR OLDCLAIM
## Min. : 1.000 Minivan :2145 no :5783 $0 :5009
## 1st Qu.: 1.000 Panel Truck: 676 yes:2378 $1,310 : 4
## Median : 4.000 Pickup :1389 $1,391 : 4
## Mean : 5.351 Sports Car : 907 $4,263 : 4
## 3rd Qu.: 7.000 Van : 750 $1,105 : 3
## Max. :25.000 z_SUV :2294 $1,332 : 3
## (Other):3134
## CLM_FREQ REVOKED MVR_PTS CAR_AGE
## Min. :0.0000 No :7161 Min. : 0.000 Min. :-3.000
## 1st Qu.:0.0000 Yes:1000 1st Qu.: 0.000 1st Qu.: 1.000
## Median :0.0000 Median : 1.000 Median : 8.000
## Mean :0.7986 Mean : 1.696 Mean : 8.328
## 3rd Qu.:2.0000 3rd Qu.: 3.000 3rd Qu.:12.000
## Max. :5.0000 Max. :13.000 Max. :28.000
## NA's :510
## URBANICITY
## Highly Urban/ Urban :6492
## z_Highly Rural/ Rural:1669
##
##
##
##
##
insurance_raw$TARGET_FLAG <- as.factor(insurance_raw$TARGET_FLAG)
ggplot(insurance_raw, aes(TARGET_FLAG)) + geom_bar(aes(fill=TARGET_FLAG))
print(table(insurance_raw$TARGET_FLAG))
##
## 0 1
## 6008 2153
As we see above the people which have claimed (target flag = 1) are 2153 in number and who have not claimed (target flag = 0) are 6008. So it is an imbalanced dataset.
### Blank data
train_missing_df <- data.frame(apply(insurance_raw, 2, function(x) length(which(x == ''))))
print(train_missing_df)
## apply.insurance_raw..2..function.x..length.which.x.........
## TARGET_FLAG 0
## TARGET_AMT 0
## KIDSDRIV 0
## AGE 0
## HOMEKIDS 0
## YOJ 0
## INCOME 445
## PARENT1 0
## HOME_VAL 464
## MSTATUS 0
## SEX 0
## EDUCATION 0
## JOB 526
## TRAVTIME 0
## CAR_USE 0
## BLUEBOOK 0
## TIF 0
## CAR_TYPE 0
## RED_CAR 0
## OLDCLAIM 0
## CLM_FREQ 0
## REVOKED 0
## MVR_PTS 0
## CAR_AGE 0
## URBANICITY 0
### NA data
train_na_df1 <- data.frame(apply(insurance_raw, 2, function(x) length(which(is.na(x)))))
print(train_na_df1)
## apply.insurance_raw..2..function.x..length.which.is.na.x....
## TARGET_FLAG 0
## TARGET_AMT 0
## KIDSDRIV 0
## AGE 6
## HOMEKIDS 0
## YOJ 454
## INCOME 0
## PARENT1 0
## HOME_VAL 0
## MSTATUS 0
## SEX 0
## EDUCATION 0
## JOB 0
## TRAVTIME 0
## CAR_USE 0
## BLUEBOOK 0
## TIF 0
## CAR_TYPE 0
## RED_CAR 0
## OLDCLAIM 0
## CLM_FREQ 0
## REVOKED 0
## MVR_PTS 0
## CAR_AGE 510
## URBANICITY 0
print(unique(insurance_raw$KIDSDRIV))
## [1] 0 1 2 3 4
ggplot(insurance_raw, aes(KIDSDRIV)) + geom_bar(aes(fill=KIDSDRIV))
insurance_raw[insurance_raw$KIDSDRIV != 0,] %>% ggplot(aes(KIDSDRIV)) +
geom_bar(aes(fill=KIDSDRIV))
Adults_only_policies <- insurance_raw[insurance_raw$KIDSDRIV == 0,]
with_kids_policies <- insurance_raw[insurance_raw$KIDSDRIV != 0,]
table(Adults_only_policies$TARGET_FLAG) / nrow(Adults_only_policies)
##
## 0 1
## 0.7530641 0.2469359
table(with_kids_policies$TARGET_FLAG) / nrow(with_kids_policies)
##
## 0 1
## 0.6126402 0.3873598
### Age
ggplot(insurance_raw, aes(AGE)) + geom_histogram(binwidth = 5)
## Warning: Removed 6 rows containing non-finite values (stat_bin).
ggplot(insurance_raw, aes(AGE)) + geom_histogram(binwidth = 5) +
facet_grid(~TARGET_FLAG)
## Warning: Removed 6 rows containing non-finite values (stat_bin).
ggplot(insurance_raw, aes(x = AGE, group = TARGET_FLAG, fill = TARGET_FLAG)) +
geom_histogram(position = "dodge")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 6 rows containing non-finite values (stat_bin).
### homekids
unique(insurance_raw$HOMEKIDS)
## [1] 0 1 2 3 4 5
kids_0_insurance <- insurance_raw[insurance_raw$HOMEKIDS == 0,]
kids_non0_insurance <- insurance_raw[insurance_raw$HOMEKIDS != 0,]
kids_0_insurance$TARGET_FLAG <- as.integer(as.character(kids_0_insurance$TARGET_FLAG))
kids_non0_insurance$TARGET_FLAG <- as.integer(as.character(kids_non0_insurance$TARGET_FLAG))
ggplot(kids_0_insurance, aes(TARGET_FLAG)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(kids_non0_insurance, aes(TARGET_FLAG)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
table(kids_0_insurance$TARGET_FLAG) / nrow(kids_0_insurance)
##
## 0 1
## 0.7782189 0.2217811
table(kids_non0_insurance$TARGET_FLAG) / nrow(kids_non0_insurance)
##
## 0 1
## 0.6587744 0.3412256
ggplot(insurance_raw, aes(x = HOMEKIDS, group = TARGET_FLAG, fill = TARGET_FLAG)) +
geom_histogram(position = "dodge")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
### Years on job
ggplot(data = insurance_raw, aes(x = YOJ, y = TARGET_FLAG)) + geom_point() +
geom_jitter()
## Warning: Removed 454 rows containing missing values (geom_point).
## Warning: Removed 454 rows containing missing values (geom_point).
ggplot(insurance_raw, aes(x = YOJ, group = TARGET_FLAG, fill = TARGET_FLAG)) +
geom_histogram(position = "dodge")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 454 rows containing non-finite values (stat_bin).
### INCOME
any(is.na(insurance_raw$INCOME))
## [1] FALSE
any(insurance_raw$INCOME == '')
## [1] TRUE
insurance_raw$INCOME[insurance_raw$INCOME == ''] <- NA
sum(is.na(insurance_raw$INCOME))
## [1] 445
insurance_raw$INCOME <- str_remove_all(insurance_raw$INCOME, "[$,]") %>% as.integer()
ggplot(insurance_raw, aes(INCOME)) + geom_histogram() +
facet_grid(~TARGET_FLAG)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 445 rows containing non-finite values (stat_bin).
ggplot(data = insurance_raw, aes(x = INCOME, y = TARGET_FLAG)) + geom_point() +
geom_jitter()
## Warning: Removed 445 rows containing missing values (geom_point).
## Warning: Removed 445 rows containing missing values (geom_point).
### PARENT1
xtabs(~insurance_raw$PARENT1 + insurance_raw$TARGET_FLAG)
## insurance_raw$TARGET_FLAG
## insurance_raw$PARENT1 0 1
## No 5407 1677
## Yes 601 476
sum(is.na(insurance_raw$HOME_VAL))
## [1] 0
insurance_raw$HOME_VAL[insurance_raw$HOME_VAL == ''] <- NA
sum(is.na(insurance_raw$HOME_VAL))
## [1] 464
insurance_raw$HOME_VAL <- str_remove_all(insurance_raw$HOME_VAL, "[$,]") %>% as.integer()
ggplot(insurance_raw, aes(HOME_VAL)) + geom_histogram() +
facet_grid(~TARGET_FLAG)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 464 rows containing non-finite values (stat_bin).
ggplot(data = insurance_raw, aes(x = HOME_VAL, y = TARGET_FLAG)) + geom_point() +
geom_jitter()
## Warning: Removed 464 rows containing missing values (geom_point).
## Warning: Removed 464 rows containing missing values (geom_point).
unique(insurance_raw$MSTATUS)
## [1] z_No Yes
## Levels: Yes z_No
ggplot(data = insurance_raw, aes(x = MSTATUS)) +
geom_bar(aes(fill = TARGET_FLAG), position = position_dodge())
insurance_raw %>%
ggplot(aes(MSTATUS)) +
geom_bar(aes(fill = TARGET_FLAG), position = position_dodge())
ggplot(data = insurance_raw, aes(x = SEX)) +
geom_bar(aes(fill = TARGET_FLAG), position = position_dodge())
unique(insurance_raw$EDUCATION)
## [1] PhD z_High School <High School Bachelors Masters
## Levels: <High School Bachelors Masters PhD z_High School
ggplot(data = insurance_raw, aes(x = EDUCATION)) +
geom_bar(aes(fill = TARGET_FLAG), position = position_dodge())
unique(insurance_raw$JOB)
## [1] Professional z_Blue Collar Clerical Doctor Lawyer
## [6] Manager Home Maker Student
## 9 Levels: Clerical Doctor Home Maker Lawyer Manager ... z_Blue Collar
insurance_raw$JOB[insurance_raw$JOB == ''] <- NA
ggplot(data = insurance_raw, aes(x = JOB)) +
geom_bar(aes(fill = TARGET_FLAG), position = position_dodge())
ggplot(insurance_raw, aes(x = TRAVTIME, group = TARGET_FLAG, fill = TARGET_FLAG)) +
geom_histogram(position = "dodge")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(data = insurance_raw, aes(x = TRAVTIME, y = TARGET_FLAG)) + geom_point() +
geom_jitter()
unique(insurance_raw$CAR_USE)
## [1] Private Commercial
## Levels: Commercial Private
ggplot(data = insurance_raw, aes(x = CAR_USE)) +
geom_bar(aes(fill = TARGET_FLAG), position = position_dodge())
sum(insurance_raw$BLUEBOOK == '')
## [1] 0
insurance_raw$BLUEBOOK <- str_remove_all(insurance_raw$BLUEBOOK, "[$,]") %>% as.integer()
#ggplot(insurance_raw, aes(BLUEBOOK)) + geom_histogram() +
# facet_grid(~TARGET_FLAG)
#ggplot(insurance_raw, aes(x = BLUEBOOK, group = TARGET_FLAG, fill = TARGET_FLAG)) +
# geom_histogram()
ggplot(insurance_raw, aes(x = BLUEBOOK, group = TARGET_FLAG, fill = TARGET_FLAG)) +
geom_histogram(position = "dodge")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(data = insurance_raw, aes(x = BLUEBOOK, y = TARGET_FLAG)) + geom_point() +
geom_jitter()
ggplot(insurance_raw, aes(x = TIF, group = TARGET_FLAG, fill = TARGET_FLAG)) +
geom_histogram(position = "dodge")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(data = insurance_raw, aes(x = TIF, y = TARGET_FLAG)) + geom_point() +
geom_jitter()
unique(insurance_raw$CAR_TYPE)
## [1] Minivan z_SUV Sports Car Van Panel Truck Pickup
## Levels: Minivan Panel Truck Pickup Sports Car Van z_SUV
ggplot(data = insurance_raw, aes(x = CAR_TYPE)) +
geom_bar(aes(fill = TARGET_FLAG), position = position_dodge())
unique(insurance_raw$RED_CAR)
## [1] yes no
## Levels: no yes
ggplot(data = insurance_raw, aes(x = RED_CAR)) +
geom_bar(aes(fill = TARGET_FLAG), position = position_dodge())
RED_CAR_xtabs <- xtabs(~insurance_raw$RED_CAR + insurance_raw$TARGET_FLAG)
print(RED_CAR_xtabs)
## insurance_raw$TARGET_FLAG
## insurance_raw$RED_CAR 0 1
## no 4246 1537
## yes 1762 616
RED_CAR_xtabs[1,] <- RED_CAR_xtabs[1,] / sum(RED_CAR_xtabs[1,])
RED_CAR_xtabs[2,] <- RED_CAR_xtabs[2,] / sum(RED_CAR_xtabs[2,])
print(RED_CAR_xtabs)
## insurance_raw$TARGET_FLAG
## insurance_raw$RED_CAR 0 1
## no 0.7342210 0.2657790
## yes 0.7409588 0.2590412
#####OLDCLAIM
sum(insurance_raw$OLDCLAIM == '')
## [1] 0
insurance_raw$OLDCLAIM <- str_remove_all(insurance_raw$OLDCLAIM, "[$,]") %>% as.integer()
ggplot(insurance_raw, aes(x = OLDCLAIM, group = TARGET_FLAG, fill = TARGET_FLAG)) +
geom_histogram(position = "dodge")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(data = insurance_raw, aes(x = OLDCLAIM, y = TARGET_FLAG)) + geom_point() +
geom_jitter()
ggplot(data = insurance_raw, aes(x = OLDCLAIM, y = TARGET_AMT)) + geom_point()
### CLM_FREQ
unique(insurance_raw$CLM_FREQ)
## [1] 2 0 1 3 5 4
ggplot(data = insurance_raw, aes(x = CLM_FREQ)) +
geom_bar(aes(fill = TARGET_FLAG), position = position_dodge())
CLM_FREQ_xtabs <- xtabs(~insurance_raw$CLM_FREQ + insurance_raw$TARGET_FLAG)
print(CLM_FREQ_xtabs)
## insurance_raw$TARGET_FLAG
## insurance_raw$CLM_FREQ 0 1
## 0 4111 898
## 1 612 385
## 2 702 469
## 3 462 314
## 4 110 80
## 5 11 7
CLM_FREQ_xtabs[1,] <- CLM_FREQ_xtabs[1,] / sum(CLM_FREQ_xtabs[1,])
CLM_FREQ_xtabs[2,] <- CLM_FREQ_xtabs[2,] / sum(CLM_FREQ_xtabs[2,])
CLM_FREQ_xtabs[3,] <- CLM_FREQ_xtabs[3,] / sum(CLM_FREQ_xtabs[3,])
CLM_FREQ_xtabs[4,] <- CLM_FREQ_xtabs[4,] / sum(CLM_FREQ_xtabs[4,])
CLM_FREQ_xtabs[5,] <- CLM_FREQ_xtabs[5,] / sum(CLM_FREQ_xtabs[5,])
CLM_FREQ_xtabs[6,] <- CLM_FREQ_xtabs[6,] / sum(CLM_FREQ_xtabs[6,])
print(CLM_FREQ_xtabs)
## insurance_raw$TARGET_FLAG
## insurance_raw$CLM_FREQ 0 1
## 0 0.8207227 0.1792773
## 1 0.6138415 0.3861585
## 2 0.5994876 0.4005124
## 3 0.5953608 0.4046392
## 4 0.5789474 0.4210526
## 5 0.6111111 0.3888889
### REVOKED
unique(insurance_raw$REVOKED)
## [1] No Yes
## Levels: No Yes
ggplot(data = insurance_raw, aes(x = REVOKED)) +
geom_bar(aes(fill = TARGET_FLAG), position = position_dodge())
REVOKED_xtabs <- xtabs(~insurance_raw$REVOKED + insurance_raw$TARGET_FLAG)
print(REVOKED_xtabs)
## insurance_raw$TARGET_FLAG
## insurance_raw$REVOKED 0 1
## No 5451 1710
## Yes 557 443
REVOKED_xtabs[1,] <- REVOKED_xtabs[1,] / sum(REVOKED_xtabs[1,])
REVOKED_xtabs[2,] <- REVOKED_xtabs[2,] / sum(REVOKED_xtabs[2,])
print(REVOKED_xtabs)
## insurance_raw$TARGET_FLAG
## insurance_raw$REVOKED 0 1
## No 0.7612065 0.2387935
## Yes 0.5570000 0.4430000
print(summary(insurance_raw$MVR_PTS))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 1.000 1.696 3.000 13.000
print(unique(insurance_raw$MVR_PTS))
## [1] 3 0 10 1 5 2 4 13 6 7 8 9 11
ggplot(insurance_raw, aes(x = MVR_PTS, group = TARGET_FLAG, fill = TARGET_FLAG)) +
geom_histogram(position = "dodge")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(data = insurance_raw, aes(x = MVR_PTS, y = TARGET_FLAG)) + geom_point() +
geom_jitter()
summary(insurance_raw$CAR_AGE)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## -3.000 1.000 8.000 8.328 12.000 28.000 510
ggplot(insurance_raw, aes(x = CAR_AGE, group = TARGET_FLAG, fill = TARGET_FLAG)) +
geom_histogram(position = "dodge")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 510 rows containing non-finite values (stat_bin).
ggplot(data = insurance_raw, aes(x = CAR_AGE, y = TARGET_FLAG)) + geom_point() +
geom_jitter()
## Warning: Removed 510 rows containing missing values (geom_point).
## Warning: Removed 510 rows containing missing values (geom_point).
### urbanicity
print(summary(insurance_raw$URBANICITY))
## Highly Urban/ Urban z_Highly Rural/ Rural
## 6492 1669
print(unique(insurance_raw$URBANICITY))
## [1] Highly Urban/ Urban z_Highly Rural/ Rural
## Levels: Highly Urban/ Urban z_Highly Rural/ Rural
ggplot(data = insurance_raw, aes(x = URBANICITY)) +
geom_bar(aes(fill = TARGET_FLAG), position = position_dodge())
URBANICITY_xtabs <- xtabs(~insurance_raw$URBANICITY + insurance_raw$TARGET_FLAG)
print(URBANICITY_xtabs)
## insurance_raw$TARGET_FLAG
## insurance_raw$URBANICITY 0 1
## Highly Urban/ Urban 4454 2038
## z_Highly Rural/ Rural 1554 115
URBANICITY_xtabs[1,] <- URBANICITY_xtabs[1,] / sum(URBANICITY_xtabs[1,])
URBANICITY_xtabs[2,] <- URBANICITY_xtabs[2,] / sum(URBANICITY_xtabs[2,])
print(URBANICITY_xtabs)
## insurance_raw$TARGET_FLAG
## insurance_raw$URBANICITY 0 1
## Highly Urban/ Urban 0.68607517 0.31392483
## z_Highly Rural/ Rural 0.93109646 0.06890354
We will impute only the numerical missing data. We also have 1 feature which is categorical and missing some data. We will not impute the missing categorical data - feature JOB, and instead we will drop the records having NA in this feature. Rest all missing data is the numerical field data, and we are using the mice package below to impute it.
#sum(!complete.cases(insurance_raw))
summary(insurance_raw$YOJ)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.0 9.0 11.0 10.5 13.0 23.0 454
summary(insurance_raw$AGE)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 16.00 39.00 45.00 44.79 51.00 81.00 6
summary(insurance_raw$INCOME)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0 28097 54028 61898 85986 367030 445
summary(insurance_raw$HOME_VAL)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0 0 161160 154867 238724 885282 464
summary(insurance_raw$JOB)
## Clerical Doctor Home Maker Lawyer
## 0 1271 246 641 835
## Manager Professional Student z_Blue Collar NA's
## 988 1117 712 1825 526
summary(insurance_raw$CAR_AGE)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## -3.000 1.000 8.000 8.328 12.000 28.000 510
md.pattern(insurance_raw)
## TARGET_FLAG TARGET_AMT KIDSDRIV HOMEKIDS PARENT1 MSTATUS SEX
## 6045 1 1 1 1 1 1 1
## 403 1 1 1 1 1 1 1
## 397 1 1 1 1 1 1 1
## 34 1 1 1 1 1 1 1
## 346 1 1 1 1 1 1 1
## 32 1 1 1 1 1 1 1
## 26 1 1 1 1 1 1 1
## 3 1 1 1 1 1 1 1
## 357 1 1 1 1 1 1 1
## 28 1 1 1 1 1 1 1
## 17 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1
## 20 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1
## 342 1 1 1 1 1 1 1
## 22 1 1 1 1 1 1 1
## 23 1 1 1 1 1 1 1
## 21 1 1 1 1 1 1 1
## 2 1 1 1 1 1 1 1
## 5 1 1 1 1 1 1 1
## 22 1 1 1 1 1 1 1
## 2 1 1 1 1 1 1 1
## 4 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1
## 3 1 1 1 1 1 1 1
## 2 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1
## 0 0 0 0 0 0 0
## EDUCATION TRAVTIME CAR_USE BLUEBOOK TIF CAR_TYPE RED_CAR OLDCLAIM
## 6045 1 1 1 1 1 1 1 1
## 403 1 1 1 1 1 1 1 1
## 397 1 1 1 1 1 1 1 1
## 34 1 1 1 1 1 1 1 1
## 346 1 1 1 1 1 1 1 1
## 32 1 1 1 1 1 1 1 1
## 26 1 1 1 1 1 1 1 1
## 3 1 1 1 1 1 1 1 1
## 357 1 1 1 1 1 1 1 1
## 28 1 1 1 1 1 1 1 1
## 17 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1
## 20 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1
## 342 1 1 1 1 1 1 1 1
## 22 1 1 1 1 1 1 1 1
## 23 1 1 1 1 1 1 1 1
## 21 1 1 1 1 1 1 1 1
## 2 1 1 1 1 1 1 1 1
## 5 1 1 1 1 1 1 1 1
## 22 1 1 1 1 1 1 1 1
## 2 1 1 1 1 1 1 1 1
## 4 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1
## 3 1 1 1 1 1 1 1 1
## 2 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1
## 0 0 0 0 0 0 0 0
## CLM_FREQ REVOKED MVR_PTS URBANICITY AGE INCOME YOJ HOME_VAL CAR_AGE
## 6045 1 1 1 1 1 1 1 1 1
## 403 1 1 1 1 1 1 1 1 1
## 397 1 1 1 1 1 1 1 1 0
## 34 1 1 1 1 1 1 1 1 0
## 346 1 1 1 1 1 1 1 0 1
## 32 1 1 1 1 1 1 1 0 1
## 26 1 1 1 1 1 1 1 0 0
## 3 1 1 1 1 1 1 1 0 0
## 357 1 1 1 1 1 1 0 1 1
## 28 1 1 1 1 1 1 0 1 1
## 17 1 1 1 1 1 1 0 1 0
## 1 1 1 1 1 1 1 0 1 0
## 20 1 1 1 1 1 1 0 0 1
## 1 1 1 1 1 1 1 0 0 1
## 1 1 1 1 1 1 1 0 0 0
## 342 1 1 1 1 1 0 1 1 1
## 22 1 1 1 1 1 0 1 1 1
## 23 1 1 1 1 1 0 1 1 0
## 21 1 1 1 1 1 0 1 0 1
## 2 1 1 1 1 1 0 1 0 1
## 5 1 1 1 1 1 0 1 0 0
## 22 1 1 1 1 1 0 0 1 1
## 2 1 1 1 1 1 0 0 1 0
## 4 1 1 1 1 1 0 0 0 1
## 1 1 1 1 1 1 0 0 0 0
## 3 1 1 1 1 0 1 1 1 1
## 2 1 1 1 1 0 1 1 0 1
## 1 1 1 1 1 0 0 1 1 1
## 0 0 0 0 6 445 454 464 510
## JOB
## 6045 1 0
## 403 0 1
## 397 1 1
## 34 0 2
## 346 1 1
## 32 0 2
## 26 1 2
## 3 0 3
## 357 1 1
## 28 0 2
## 17 1 2
## 1 0 3
## 20 1 2
## 1 0 3
## 1 1 3
## 342 1 1
## 22 0 2
## 23 1 2
## 21 1 2
## 2 0 3
## 5 1 3
## 22 1 2
## 2 1 3
## 4 1 3
## 1 1 4
## 3 1 1
## 2 1 2
## 1 1 2
## 526 2405
insurance_raw2 <- insurance_raw %>% drop_na(JOB)
aggr_plot <- aggr(insurance_raw2, col=c('navyblue','red'),
numbers=TRUE, sortVars=TRUE,
labels=names(insurance_raw), cex.axis=.7,
gap=3, ylab=c("Histogram of missing data","Pattern"))
## Warning in plot.aggr(res, ...): not enough horizontal space to display
## frequencies
##
## Variables sorted by number of missings:
## Variable Count
## CAR_AGE 0.0618205632
## HOME_VAL 0.0557956778
## YOJ 0.0555337263
## INCOME 0.0551407990
## AGE 0.0007858546
## TARGET_FLAG 0.0000000000
## TARGET_AMT 0.0000000000
## KIDSDRIV 0.0000000000
## HOMEKIDS 0.0000000000
## PARENT1 0.0000000000
## MSTATUS 0.0000000000
## SEX 0.0000000000
## EDUCATION 0.0000000000
## JOB 0.0000000000
## TRAVTIME 0.0000000000
## CAR_USE 0.0000000000
## BLUEBOOK 0.0000000000
## TIF 0.0000000000
## CAR_TYPE 0.0000000000
## RED_CAR 0.0000000000
## OLDCLAIM 0.0000000000
## CLM_FREQ 0.0000000000
## REVOKED 0.0000000000
## MVR_PTS 0.0000000000
## URBANICITY 0.0000000000
insurance_raw_imputed <- mice(data = insurance_raw2, m = 1,
method = "pmm", maxit = 5, seed = 500)
##
## iter imp variable
## 1 1 AGE YOJ INCOME HOME_VAL CAR_AGE
## 2 1 AGE YOJ INCOME HOME_VAL CAR_AGE
## 3 1 AGE YOJ INCOME HOME_VAL CAR_AGE
## 4 1 AGE YOJ INCOME HOME_VAL CAR_AGE
## 5 1 AGE YOJ INCOME HOME_VAL CAR_AGE
## Warning: Number of logged events: 25
insurance_raw_imputed_df <- mice::complete(insurance_raw_imputed, 1)
head(insurance_raw_imputed_df)
## TARGET_FLAG TARGET_AMT KIDSDRIV AGE HOMEKIDS YOJ INCOME PARENT1 HOME_VAL
## 1 0 0 0 60 0 11 67349 No 0
## 2 0 0 0 43 0 11 91449 No 257252
## 3 0 0 0 35 1 10 16039 No 124191
## 4 0 0 0 51 0 14 71518 No 306251
## 5 0 0 0 50 0 8 114986 No 243925
## 6 1 2946 0 34 1 12 125301 Yes 0
## MSTATUS SEX EDUCATION JOB TRAVTIME CAR_USE BLUEBOOK TIF
## 1 z_No M PhD Professional 14 Private 14230 11
## 2 z_No M z_High School z_Blue Collar 22 Commercial 14940 1
## 3 Yes z_F z_High School Clerical 5 Private 4010 4
## 4 Yes M <High School z_Blue Collar 32 Private 15440 7
## 5 Yes z_F PhD Doctor 36 Private 18000 1
## 6 z_No z_F Bachelors z_Blue Collar 46 Commercial 17430 1
## CAR_TYPE RED_CAR OLDCLAIM CLM_FREQ REVOKED MVR_PTS CAR_AGE
## 1 Minivan yes 4461 2 No 3 18
## 2 Minivan yes 0 0 No 0 1
## 3 z_SUV no 38690 2 No 3 10
## 4 Minivan yes 0 0 No 0 6
## 5 z_SUV no 19217 2 Yes 3 17
## 6 Sports Car no 0 0 No 0 7
## URBANICITY
## 1 Highly Urban/ Urban
## 2 Highly Urban/ Urban
## 3 Highly Urban/ Urban
## 4 Highly Urban/ Urban
## 5 Highly Urban/ Urban
## 6 Highly Urban/ Urban
summary(insurance_raw$YOJ)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.0 9.0 11.0 10.5 13.0 23.0 454
summary(insurance_raw_imputed_df$YOJ)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 9.00 11.00 10.44 13.00 23.00
summary(insurance_raw$AGE)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 16.00 39.00 45.00 44.79 51.00 81.00 6
summary(insurance_raw_imputed_df$AGE)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 16.00 39.00 45.00 44.66 51.00 81.00
summary(insurance_raw$INCOME)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0 28097 54028 61898 85986 367030 445
summary(insurance_raw_imputed_df$INCOME)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 26113 51121 57588 80292 367030
summary(insurance_raw$HOME_VAL)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0 0 161160 154867 238724 885282 464
summary(insurance_raw_imputed_df$HOME_VAL)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 0 158485 149351 232022 885282
summary(insurance_raw$JOB)
## Clerical Doctor Home Maker Lawyer
## 0 1271 246 641 835
## Manager Professional Student z_Blue Collar NA's
## 988 1117 712 1825 526
summary(insurance_raw_imputed_df$JOB)
## Clerical Doctor Home Maker Lawyer
## 0 1271 246 641 835
## Manager Professional Student z_Blue Collar
## 988 1117 712 1825
summary(insurance_raw$CAR_AGE)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## -3.000 1.000 8.000 8.328 12.000 28.000 510
summary(insurance_raw_imputed_df$CAR_AGE)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -3.000 1.000 8.000 7.958 12.000 28.000
Now all the data is present. So we can start building our models.
We are splitting our data into training and test dataset - first for our classification problem.
#### Splitting the dataset into test and train dataset
n <- nrow(insurance_raw_imputed_df)
set.seed(123)
insurance_random_index <- insurance_raw_imputed_df[sample(n), ]
insurance.train.df <- insurance_random_index[1:as.integer(0.7*n),]
insurance.test.df <- insurance_random_index[as.integer(0.7*n +1):n, ]
table(insurance.test.df$TARGET_FLAG) / nrow(insurance.test.df)
##
## 0 1
## 0.7376691 0.2623309
table(insurance.train.df$TARGET_FLAG) / nrow(insurance.train.df)
##
## 0 1
## 0.7350299 0.2649701
Taking all features
### Building the first Logistic Regression model using all the variables:
logitModel1 <- glm(TARGET_FLAG ~ . - TARGET_AMT, data = insurance.train.df,
family = binomial(link = "logit"))
summary(logitModel1)
##
## Call:
## glm(formula = TARGET_FLAG ~ . - TARGET_AMT, family = binomial(link = "logit"),
## data = insurance.train.df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4985 -0.6993 -0.3756 0.6141 2.9097
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -7.102e-03 3.365e-01 -0.021 0.983158
## KIDSDRIV 3.450e-01 7.551e-02 4.569 4.91e-06 ***
## AGE -3.817e-03 4.997e-03 -0.764 0.444950
## HOMEKIDS 3.738e-02 4.527e-02 0.826 0.408929
## YOJ -1.196e-02 1.042e-02 -1.148 0.251161
## INCOME -4.243e-06 1.550e-06 -2.737 0.006202 **
## PARENT1Yes 3.897e-01 1.355e-01 2.876 0.004023 **
## HOME_VAL -1.682e-06 4.625e-07 -3.637 0.000276 ***
## MSTATUSz_No 4.542e-01 1.067e-01 4.255 2.09e-05 ***
## SEXz_F -2.054e-01 1.419e-01 -1.447 0.147937
## EDUCATIONBachelors -3.038e-01 1.412e-01 -2.151 0.031461 *
## EDUCATIONMasters -2.759e-01 2.255e-01 -1.224 0.221046
## EDUCATIONPhD 2.527e-01 2.738e-01 0.923 0.356052
## EDUCATIONz_High School 6.786e-02 1.136e-01 0.597 0.550331
## JOBDoctor -1.365e+00 3.700e-01 -3.689 0.000225 ***
## JOBHome Maker -3.509e-01 1.728e-01 -2.030 0.042328 *
## JOBLawyer -3.407e-01 2.253e-01 -1.513 0.130364
## JOBManager -9.672e-01 1.740e-01 -5.560 2.70e-08 ***
## JOBProfessional -2.775e-01 1.504e-01 -1.845 0.065058 .
## JOBStudent -3.874e-01 1.608e-01 -2.409 0.015979 *
## JOBz_Blue Collar -9.860e-02 1.275e-01 -0.773 0.439290
## TRAVTIME 1.306e-02 2.363e-03 5.524 3.31e-08 ***
## CAR_USEPrivate -8.102e-01 1.120e-01 -7.235 4.64e-13 ***
## BLUEBOOK -2.508e-05 6.621e-06 -3.789 0.000151 ***
## TIF -6.224e-02 9.191e-03 -6.771 1.28e-11 ***
## CAR_TYPEPanel Truck 4.880e-01 2.132e-01 2.289 0.022088 *
## CAR_TYPEPickup 5.311e-01 1.227e-01 4.330 1.49e-05 ***
## CAR_TYPESports Car 1.042e+00 1.601e-01 6.510 7.53e-11 ***
## CAR_TYPEVan 4.248e-01 1.638e-01 2.592 0.009530 **
## CAR_TYPEz_SUV 7.916e-01 1.372e-01 5.770 7.95e-09 ***
## RED_CARyes -3.921e-02 1.107e-01 -0.354 0.723091
## OLDCLAIM -1.219e-05 4.867e-06 -2.505 0.012233 *
## CLM_FREQ 1.844e-01 3.590e-02 5.137 2.79e-07 ***
## REVOKEDYes 9.186e-01 1.152e-01 7.974 1.53e-15 ***
## MVR_PTS 1.107e-01 1.723e-02 6.425 1.32e-10 ***
## CAR_AGE -1.091e-03 9.494e-03 -0.115 0.908492
## URBANICITYz_Highly Rural/ Rural -2.366e+00 1.351e-01 -17.520 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 6179.7 on 5343 degrees of freedom
## Residual deviance: 4693.7 on 5307 degrees of freedom
## AIC: 4767.7
##
## Number of Fisher Scoring iterations: 5
Removing the 4 statistically insignificant features from the model: 1. AGE 2. HOMEKIDS 3. YOJ 4. CAR_AGE
logitModel2 <- glm(TARGET_FLAG ~ KIDSDRIV + INCOME + PARENT1 + HOME_VAL + MSTATUS +
SEX + EDUCATION + JOB + TRAVTIME + CAR_USE + BLUEBOOK + TIF +
CAR_TYPE + RED_CAR + OLDCLAIM +CLM_FREQ + REVOKED + MVR_PTS +
URBANICITY
, data = insurance.train.df,
family = binomial(link = "logit"))
summary(logitModel2)
##
## Call:
## glm(formula = TARGET_FLAG ~ KIDSDRIV + INCOME + PARENT1 + HOME_VAL +
## MSTATUS + SEX + EDUCATION + JOB + TRAVTIME + CAR_USE + BLUEBOOK +
## TIF + CAR_TYPE + RED_CAR + OLDCLAIM + CLM_FREQ + REVOKED +
## MVR_PTS + URBANICITY, family = binomial(link = "logit"),
## data = insurance.train.df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.5212 -0.6976 -0.3772 0.6171 2.8879
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.551e-01 2.453e-01 -1.040 0.298401
## KIDSDRIV 3.658e-01 6.879e-02 5.318 1.05e-07 ***
## INCOME -4.345e-06 1.538e-06 -2.824 0.004738 **
## PARENT1Yes 4.746e-01 1.160e-01 4.090 4.31e-05 ***
## HOME_VAL -1.737e-06 4.610e-07 -3.769 0.000164 ***
## MSTATUSz_No 4.346e-01 1.014e-01 4.288 1.81e-05 ***
## SEXz_F -1.809e-01 1.405e-01 -1.288 0.197830
## EDUCATIONBachelors -3.093e-01 1.322e-01 -2.339 0.019321 *
## EDUCATIONMasters -2.939e-01 2.003e-01 -1.467 0.142377
## EDUCATIONPhD 2.418e-01 2.567e-01 0.942 0.346057
## EDUCATIONz_High School 6.615e-02 1.131e-01 0.585 0.558819
## JOBDoctor -1.392e+00 3.690e-01 -3.771 0.000162 ***
## JOBHome Maker -3.052e-01 1.642e-01 -1.859 0.062964 .
## JOBLawyer -3.604e-01 2.245e-01 -1.605 0.108488
## JOBManager -9.830e-01 1.732e-01 -5.675 1.39e-08 ***
## JOBProfessional -2.898e-01 1.498e-01 -1.935 0.052991 .
## JOBStudent -3.268e-01 1.528e-01 -2.138 0.032524 *
## JOBz_Blue Collar -1.069e-01 1.272e-01 -0.840 0.400822
## TRAVTIME 1.294e-02 2.361e-03 5.483 4.19e-08 ***
## CAR_USEPrivate -8.148e-01 1.119e-01 -7.283 3.26e-13 ***
## BLUEBOOK -2.636e-05 6.552e-06 -4.024 5.73e-05 ***
## TIF -6.211e-02 9.184e-03 -6.763 1.36e-11 ***
## CAR_TYPEPanel Truck 5.095e-01 2.125e-01 2.398 0.016499 *
## CAR_TYPEPickup 5.267e-01 1.225e-01 4.298 1.72e-05 ***
## CAR_TYPESports Car 1.026e+00 1.588e-01 6.458 1.06e-10 ***
## CAR_TYPEVan 4.375e-01 1.636e-01 2.674 0.007492 **
## CAR_TYPEz_SUV 7.734e-01 1.362e-01 5.679 1.35e-08 ***
## RED_CARyes -3.751e-02 1.106e-01 -0.339 0.734466
## OLDCLAIM -1.239e-05 4.864e-06 -2.547 0.010873 *
## CLM_FREQ 1.846e-01 3.587e-02 5.145 2.68e-07 ***
## REVOKEDYes 9.230e-01 1.151e-01 8.018 1.08e-15 ***
## MVR_PTS 1.120e-01 1.720e-02 6.511 7.47e-11 ***
## URBANICITYz_Highly Rural/ Rural -2.366e+00 1.350e-01 -17.530 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 6179.7 on 5343 degrees of freedom
## Residual deviance: 4697.0 on 5311 degrees of freedom
## AIC: 4763
##
## Number of Fisher Scoring iterations: 5
This model looks good. Checking the results and metrics below.
predict_logit2 <- predict(logitModel2, newdata = insurance.test.df, type = "response")
predict_logit2_class <- ifelse(predict_logit2 > 0.5, 1, 0)
print(xtabs(~predict_logit2_class + insurance.test.df$TARGET_FLAG))
## insurance.test.df$TARGET_FLAG
## predict_logit2_class 0 1
## 0 1556 360
## 1 134 241
roc(insurance.test.df$TARGET_FLAG, predict_logit2, plot = TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##
## Call:
## roc.default(response = insurance.test.df$TARGET_FLAG, predictor = predict_logit2, plot = TRUE)
##
## Data: predict_logit2 in 1690 controls (insurance.test.df$TARGET_FLAG 0) < 601 cases (insurance.test.df$TARGET_FLAG 1).
## Area under the curve: 0.8028
Defining the metrics functions.
Accuracy_func(insurance.test.df$TARGET_FLAG, predict_logit2_class)
## [1] 0.7843736
Precision_func(insurance.test.df$TARGET_FLAG, predict_logit2_class)
## [1] 0.6426667
Sensitivity_func(insurance.test.df$TARGET_FLAG, predict_logit2_class)
## [1] 0.4009983
Specificity_func(insurance.test.df$TARGET_FLAG, predict_logit2_class)
## [1] 0.9207101
This looks to be our good model for classifying the data into whether the claim was filed or not by a customer.
We are taking only the rows which have target_flag = 1, as target_flag = 0 means 0 target_amount.
insurance_raw_claim <- insurance_raw_imputed_df[insurance_raw_imputed_df$TARGET_FLAG == 1,]
nrow(insurance_raw_claim)
## [1] 2017
# Now we will use this dataset to determine / predict the cost
boxplot(insurance_raw_claim$TARGET_AMT)
summary(insurance_raw_claim$TARGET_AMT)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 30.28 2585.00 4102.00 5621.12 5761.00 107586.14
Splitting into train and test dataset.
n2 <- nrow(insurance_raw_claim)
set.seed(123)
insurance_claim_random_index <- insurance_raw_claim[sample(n2), ]
insurance.claim.train.df <- insurance_claim_random_index[1:as.integer(0.8*n2),]
insurance.claim.test.df <- insurance_claim_random_index[as.integer(0.8*n2 +1):n2, ]
Building Linear regression model using all features.
linear_model1 <- lm(TARGET_AMT ~ KIDSDRIV + AGE + HOMEKIDS + YOJ + INCOME +
PARENT1 +HOME_VAL + MSTATUS + SEX + EDUCATION + JOB + TRAVTIME +
CAR_USE + BLUEBOOK + TIF + CAR_TYPE + RED_CAR + OLDCLAIM + CLM_FREQ +
REVOKED + MVR_PTS + CAR_AGE + URBANICITY
, data = insurance.claim.train.df)
summary(linear_model1)
##
## Call:
## lm(formula = TARGET_AMT ~ KIDSDRIV + AGE + HOMEKIDS + YOJ + INCOME +
## PARENT1 + HOME_VAL + MSTATUS + SEX + EDUCATION + JOB + TRAVTIME +
## CAR_USE + BLUEBOOK + TIF + CAR_TYPE + RED_CAR + OLDCLAIM +
## CLM_FREQ + REVOKED + MVR_PTS + CAR_AGE + URBANICITY, data = insurance.claim.train.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8989 -3212 -1430 485 98271
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.590e+03 1.703e+03 2.695 0.00711 **
## KIDSDRIV -2.460e+02 3.634e+02 -0.677 0.49842
## AGE 2.026e+01 2.413e+01 0.840 0.40118
## HOMEKIDS 8.019e+01 2.384e+02 0.336 0.73668
## YOJ 4.078e+01 5.607e+01 0.727 0.46724
## INCOME -2.639e-03 8.880e-03 -0.297 0.76635
## PARENT1Yes 3.436e+02 6.872e+02 0.500 0.61714
## HOME_VAL 1.257e-03 2.553e-03 0.492 0.62249
## MSTATUSz_No 5.510e+02 5.837e+02 0.944 0.34532
## SEXz_F -8.866e+02 7.671e+02 -1.156 0.24794
## EDUCATIONBachelors -1.858e+02 7.378e+02 -0.252 0.80118
## EDUCATIONMasters 7.574e+01 1.309e+03 0.058 0.95387
## EDUCATIONPhD 1.481e+03 1.673e+03 0.886 0.37595
## EDUCATIONz_High School -9.786e+02 5.782e+02 -1.693 0.09075 .
## JOBDoctor -2.766e+03 2.301e+03 -1.202 0.22960
## JOBHome Maker 7.478e-01 9.543e+02 0.001 0.99937
## JOBLawyer 3.335e+02 1.336e+03 0.250 0.80283
## JOBManager -9.229e+02 1.095e+03 -0.843 0.39934
## JOBProfessional 8.347e+02 8.142e+02 1.025 0.30545
## JOBStudent 9.372e+01 8.398e+02 0.112 0.91116
## JOBz_Blue Collar -1.444e+02 6.640e+02 -0.217 0.82789
## TRAVTIME -7.271e-01 1.257e+01 -0.058 0.95387
## CAR_USEPrivate -9.043e+02 5.979e+02 -1.512 0.13062
## BLUEBOOK 1.087e-01 3.583e-02 3.035 0.00245 **
## TIF -1.905e+01 4.941e+01 -0.386 0.69981
## CAR_TYPEPanel Truck -1.737e+03 1.161e+03 -1.495 0.13503
## CAR_TYPEPickup -6.063e+02 6.773e+02 -0.895 0.37084
## CAR_TYPESports Car 6.851e+02 8.611e+02 0.796 0.42638
## CAR_TYPEVan 5.506e+02 9.234e+02 0.596 0.55107
## CAR_TYPEz_SUV 3.578e+02 7.664e+02 0.467 0.64064
## RED_CARyes 1.244e+01 5.860e+02 0.021 0.98306
## OLDCLAIM 3.793e-02 2.728e-02 1.390 0.16459
## CLM_FREQ -2.578e+02 1.862e+02 -1.385 0.16628
## REVOKEDYes -1.364e+03 6.056e+02 -2.252 0.02448 *
## MVR_PTS 1.168e+02 7.869e+01 1.485 0.13784
## CAR_AGE -6.954e+01 5.199e+01 -1.338 0.18118
## URBANICITYz_Highly Rural/ Rural -8.996e+01 8.381e+02 -0.107 0.91453
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7718 on 1576 degrees of freedom
## Multiple R-squared: 0.02945, Adjusted R-squared: 0.007279
## F-statistic: 1.328 on 36 and 1576 DF, p-value: 0.09325
Removing features.
linear_model3 <- lm(TARGET_AMT ~ BLUEBOOK + REVOKED + MVR_PTS + CAR_AGE
, data = insurance.claim.train.df)
summary(linear_model3)
##
## Call:
## lm(formula = TARGET_AMT ~ BLUEBOOK + REVOKED + MVR_PTS + CAR_AGE,
## data = insurance.claim.train.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8498 -3116 -1559 280 100533
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4393.62261 491.86443 8.933 < 2e-16 ***
## BLUEBOOK 0.10642 0.02505 4.248 2.28e-05 ***
## REVOKEDYes -777.84852 482.46845 -1.612 0.107
## MVR_PTS 120.53697 74.14036 1.626 0.104
## CAR_AGE -37.07768 36.31514 -1.021 0.307
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7700 on 1608 degrees of freedom
## Multiple R-squared: 0.0143, Adjusted R-squared: 0.01185
## F-statistic: 5.832 on 4 and 1608 DF, p-value: 0.0001168
This is our final model.
eval_df <- read.csv("https://raw.githubusercontent.com/deepakmongia/Data621/master/HW-4/Data/insurance-evaluation-data.csv",
header = TRUE, row.names = 1)
dim(eval_df)
## [1] 2141 25
anyNA(eval_df)
## [1] TRUE
eval_missing_df <- data.frame(apply(eval_df, 2, function(x) length(which(x == ''))))
eval_missing_df
## apply.eval_df..2..function.x..length.which.x.........
## TARGET_FLAG 0
## TARGET_AMT 0
## KIDSDRIV 0
## AGE 0
## HOMEKIDS 0
## YOJ 0
## INCOME 125
## PARENT1 0
## HOME_VAL 111
## MSTATUS 0
## SEX 0
## EDUCATION 0
## JOB 139
## TRAVTIME 0
## CAR_USE 0
## BLUEBOOK 0
## TIF 0
## CAR_TYPE 0
## RED_CAR 0
## OLDCLAIM 0
## CLM_FREQ 0
## REVOKED 0
## MVR_PTS 0
## CAR_AGE 0
## URBANICITY 0
eval_na_df1 <- data.frame(apply(eval_df, 2, function(x) length(which(is.na(x)))))
eval_na_df1
## apply.eval_df..2..function.x..length.which.is.na.x....
## TARGET_FLAG 2141
## TARGET_AMT 2141
## KIDSDRIV 0
## AGE 1
## HOMEKIDS 0
## YOJ 94
## INCOME 0
## PARENT1 0
## HOME_VAL 0
## MSTATUS 0
## SEX 0
## EDUCATION 0
## JOB 0
## TRAVTIME 0
## CAR_USE 0
## BLUEBOOK 0
## TIF 0
## CAR_TYPE 0
## RED_CAR 0
## OLDCLAIM 0
## CLM_FREQ 0
## REVOKED 0
## MVR_PTS 0
## CAR_AGE 129
## URBANICITY 0
eval_df$INCOME[eval_df$INCOME == ''] <- NA
eval_df$HOME_VAL[eval_df$HOME_VAL == ''] <- NA
eval_df$JOB[eval_df$JOB == ''] <- NA
eval_df$INCOME <- str_remove_all(eval_df$INCOME, "[$,]") %>% as.integer()
eval_df$HOME_VAL <- str_remove_all(eval_df$HOME_VAL, "[$,]") %>% as.integer()
eval_df$BLUEBOOK <- str_remove_all(eval_df$BLUEBOOK, "[$,]") %>% as.integer()
eval_df$OLDCLAIM <- str_remove_all(eval_df$OLDCLAIM, "[$,]") %>% as.integer()
eval_df1 <- subset(eval_df, select= -c(TARGET_FLAG, TARGET_AMT, JOB))
eval_df2 <- subset(eval_df, select= c(TARGET_FLAG, TARGET_AMT, JOB))
eval_df_imputed <- mice(data = eval_df1, m = 1,
method = "pmm", maxit = 5, seed = 500)
##
## iter imp variable
## 1 1 AGE YOJ INCOME HOME_VAL CAR_AGE
## 2 1 AGE YOJ INCOME HOME_VAL CAR_AGE
## 3 1 AGE YOJ INCOME HOME_VAL CAR_AGE
## 4 1 AGE YOJ INCOME HOME_VAL CAR_AGE
## 5 1 AGE YOJ INCOME HOME_VAL CAR_AGE
eval_df_imputed_df <- mice::complete(eval_df_imputed, 1)
eval_final_df <- cbind(eval_df_imputed_df, eval_df2)
head(eval_final_df)
## KIDSDRIV AGE HOMEKIDS YOJ INCOME PARENT1 HOME_VAL MSTATUS SEX
## 3 0 48 0 11 52881 No 0 z_No M
## 9 1 40 1 11 50815 Yes 0 z_No M
## 10 0 44 2 12 43486 Yes 0 z_No z_F
## 18 0 35 2 9 21204 Yes 0 z_No M
## 21 0 59 0 12 87460 No 0 z_No M
## 30 0 46 0 14 62615 No 207519 Yes M
## EDUCATION TRAVTIME CAR_USE BLUEBOOK TIF CAR_TYPE RED_CAR
## 3 Bachelors 26 Private 21970 1 Van yes
## 9 z_High School 21 Private 18930 6 Minivan no
## 10 z_High School 30 Commercial 5900 10 z_SUV no
## 18 z_High School 74 Private 9230 6 Pickup no
## 21 z_High School 45 Private 15420 1 Minivan yes
## 30 Bachelors 7 Commercial 25660 1 Panel Truck no
## OLDCLAIM CLM_FREQ REVOKED MVR_PTS CAR_AGE URBANICITY
## 3 0 0 No 2 10 Highly Urban/ Urban
## 9 3295 1 No 2 1 Highly Urban/ Urban
## 10 0 0 No 0 10 z_Highly Rural/ Rural
## 18 0 0 Yes 0 4 z_Highly Rural/ Rural
## 21 44857 2 No 4 1 Highly Urban/ Urban
## 30 2119 1 No 2 12 Highly Urban/ Urban
## TARGET_FLAG TARGET_AMT JOB
## 3 NA NA Manager
## 9 NA NA Manager
## 10 NA NA z_Blue Collar
## 18 NA NA Clerical
## 21 NA NA Manager
## 30 NA NA Professional
dim(eval_final_df)
## [1] 2141 25
eval_final_df <- eval_final_df[,c(23,24, 1:10, 25, 11:22)]
predict_eval_logit2 <- predict(logitModel2, newdata = eval_final_df, type = "response")
eval_final_df$TARGET_FLAG <- ifelse(predict_eval_logit2 > 0.5, 1, 0)
eval_amt_df <- eval_final_df[eval_final_df$TARGET_FLAG == 1, ]
eval_amt_df$TARGET_AMT <- predict(linear_model3, newdata = eval_amt_df)
eval_final_df <- rbind(eval_final_df[eval_final_df$TARGET_FLAG == 0, ], eval_amt_df)