#Import all the libs
library('PerformanceAnalytics')
## Warning: package 'PerformanceAnalytics' was built under R version 3.6.3
## Loading required package: xts
## Warning: package 'xts' was built under R version 3.6.3
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 3.6.3
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
## Attaching package: 'PerformanceAnalytics'
## The following object is masked from 'package:graphics':
##
## legend
library('ggcorrplot')
## Warning: package 'ggcorrplot' was built under R version 3.6.3
## Loading required package: ggplot2
library('ggplot2')
library('ROCR')
## Warning: package 'ROCR' was built under R version 3.6.3
## Loading required package: gplots
## Warning: package 'gplots' was built under R version 3.6.2
##
## Attaching package: 'gplots'
## The following object is masked from 'package:PerformanceAnalytics':
##
## textplot
## The following object is masked from 'package:stats':
##
## lowess
library('tidyverse')
## -- Attaching packages ------------------------------------------------------------------- tidyverse 1.2.1 --
## v tibble 2.1.3 v purrr 0.3.2
## v tidyr 0.8.3 v dplyr 0.8.5
## v readr 1.3.1 v stringr 1.4.0
## v tibble 2.1.3 v forcats 0.4.0
## Warning: package 'dplyr' was built under R version 3.6.3
## -- Conflicts ---------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::first() masks xts::first()
## x dplyr::lag() masks stats::lag()
## x dplyr::last() masks xts::last()
library('car')
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
## The following object is masked from 'package:purrr':
##
## some
library("glmnet")
## Warning: package 'glmnet' was built under R version 3.6.2
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following object is masked from 'package:tidyr':
##
## expand
## Loaded glmnet 3.0-2
library("dplyr")
library('rpart')
library('tidyverse')
library('corrgram')
## Warning: package 'corrgram' was built under R version 3.6.2
## Registered S3 method overwritten by 'seriation':
## method from
## reorder.hclust gclus
library('glmnet')
library('boot')
##
## Attaching package: 'boot'
## The following object is masked from 'package:car':
##
## logit
Load the data
# Read the customer-value CSV into a data frame.
# stringsAsFactors = TRUE is spelled out because the default became FALSE in
# R 4.0; the factor columns shown in the output below depend on it.
bankruptcy.data <- read.csv(
  "~/Assignment/WA_Fn-UseC_-Marketing-Customer-Value-Analysis.csv",
  stringsAsFactors = TRUE
)
# Preview the first rows.
head(bankruptcy.data)
## Customer State Customer.Lifetime.Value Response Coverage Education
## 1 BU79786 Washington 2763.519 No Basic Bachelor
## 2 QZ44356 Arizona 6979.536 No Extended Bachelor
## 3 AI49188 Nevada 12887.432 No Premium Bachelor
## 4 WW63253 California 7645.862 No Basic Bachelor
## 5 HB64268 Washington 2813.693 No Basic Bachelor
## 6 OC83172 Oregon 8256.298 Yes Basic Bachelor
## Effective.To.Date EmploymentStatus Gender Income Location.Code
## 1 2/24/11 Employed F 56274 Suburban
## 2 1/31/11 Unemployed F 0 Suburban
## 3 2/19/11 Employed F 48767 Suburban
## 4 1/20/11 Unemployed M 0 Suburban
## 5 2/3/11 Employed M 43836 Rural
## 6 1/25/11 Employed F 62902 Rural
## Marital.Status Monthly.Premium.Auto Months.Since.Last.Claim
## 1 Married 69 32
## 2 Single 94 13
## 3 Married 108 18
## 4 Married 106 18
## 5 Single 73 12
## 6 Married 69 14
## Months.Since.Policy.Inception Number.of.Open.Complaints
## 1 5 0
## 2 42 0
## 3 38 0
## 4 65 0
## 5 44 0
## 6 94 0
## Number.of.Policies Policy.Type Policy Renew.Offer.Type
## 1 1 Corporate Auto Corporate L3 Offer1
## 2 8 Personal Auto Personal L3 Offer3
## 3 2 Personal Auto Personal L3 Offer1
## 4 7 Corporate Auto Corporate L2 Offer1
## 5 1 Personal Auto Personal L1 Offer1
## 6 2 Personal Auto Personal L3 Offer2
## Sales.Channel Total.Claim.Amount Vehicle.Class Vehicle.Size
## 1 Agent 384.8111 Two-Door Car Medsize
## 2 Agent 1131.4649 Four-Door Car Medsize
## 3 Agent 566.4722 Two-Door Car Medsize
## 4 Call Center 529.8813 SUV Medsize
## 5 Agent 138.1309 Four-Door Car Medsize
## 6 Web 159.3830 Two-Door Car Medsize
Data Exploration Check the datatype of the columns
# Report the class of every column of the data frame.
sapply(bankruptcy.data, class)
## Customer State
## "factor" "factor"
## Customer.Lifetime.Value Response
## "numeric" "factor"
## Coverage Education
## "factor" "factor"
## Effective.To.Date EmploymentStatus
## "factor" "factor"
## Gender Income
## "factor" "integer"
## Location.Code Marital.Status
## "factor" "factor"
## Monthly.Premium.Auto Months.Since.Last.Claim
## "integer" "integer"
## Months.Since.Policy.Inception Number.of.Open.Complaints
## "integer" "integer"
## Number.of.Policies Policy.Type
## "integer" "factor"
## Policy Renew.Offer.Type
## "factor" "factor"
## Sales.Channel Total.Claim.Amount
## "factor" "numeric"
## Vehicle.Class Vehicle.Size
## "factor" "factor"
# Per-column summary: quantiles for numeric columns, level counts for factors.
summary(bankruptcy.data)
## Customer State Customer.Lifetime.Value Response
## AA10041: 1 Arizona :1703 Min. : 1898 No :7826
## AA11235: 1 California:3150 1st Qu.: 3994 Yes:1308
## AA16582: 1 Nevada : 882 Median : 5780
## AA30683: 1 Oregon :2601 Mean : 8005
## AA34092: 1 Washington: 798 3rd Qu.: 8962
## AA35519: 1 Max. :83325
## (Other):9128
## Coverage Education Effective.To.Date
## Basic :5568 Bachelor :2748 1/10/11: 195
## Extended:2742 College :2681 1/27/11: 194
## Premium : 824 Doctor : 342 2/14/11: 186
## High School or Below:2622 1/26/11: 181
## Master : 741 1/17/11: 180
## 1/19/11: 179
## (Other):8019
## EmploymentStatus Gender Income Location.Code
## Disabled : 405 F:4658 Min. : 0 Rural :1773
## Employed :5698 M:4476 1st Qu.: 0 Suburban:5779
## Medical Leave: 432 Median :33890 Urban :1582
## Retired : 282 Mean :37657
## Unemployed :2317 3rd Qu.:62320
## Max. :99981
##
## Marital.Status Monthly.Premium.Auto Months.Since.Last.Claim
## Divorced:1369 Min. : 61.00 Min. : 0.0
## Married :5298 1st Qu.: 68.00 1st Qu.: 6.0
## Single :2467 Median : 83.00 Median :14.0
## Mean : 93.22 Mean :15.1
## 3rd Qu.:109.00 3rd Qu.:23.0
## Max. :298.00 Max. :35.0
##
## Months.Since.Policy.Inception Number.of.Open.Complaints
## Min. : 0.00 Min. :0.0000
## 1st Qu.:24.00 1st Qu.:0.0000
## Median :48.00 Median :0.0000
## Mean :48.06 Mean :0.3844
## 3rd Qu.:71.00 3rd Qu.:0.0000
## Max. :99.00 Max. :5.0000
##
## Number.of.Policies Policy.Type Policy
## Min. :1.000 Corporate Auto:1968 Personal L3 :3426
## 1st Qu.:1.000 Personal Auto :6788 Personal L2 :2122
## Median :2.000 Special Auto : 378 Personal L1 :1240
## Mean :2.966 Corporate L3:1014
## 3rd Qu.:4.000 Corporate L2: 595
## Max. :9.000 Corporate L1: 359
## (Other) : 378
## Renew.Offer.Type Sales.Channel Total.Claim.Amount
## Offer1:3752 Agent :3477 Min. : 0.099
## Offer2:2926 Branch :2567 1st Qu.: 272.258
## Offer3:1432 Call Center:1765 Median : 383.945
## Offer4:1024 Web :1325 Mean : 434.089
## 3rd Qu.: 547.515
## Max. :2893.240
##
## Vehicle.Class Vehicle.Size
## Four-Door Car:4621 Large : 946
## Luxury Car : 163 Medsize:6424
## Luxury SUV : 184 Small :1764
## Sports Car : 484
## SUV :1796
## Two-Door Car :1886
##
# Compact structure listing: dimensions plus type and first values per column.
str(bankruptcy.data)
## 'data.frame': 9134 obs. of 24 variables:
## $ Customer : Factor w/ 9134 levels "AA10041","AA11235",..: 601 5947 97 8017 2489 4948 8434 756 1352 548 ...
## $ State : Factor w/ 5 levels "Arizona","California",..: 5 1 3 2 5 4 4 1 4 4 ...
## $ Customer.Lifetime.Value : num 2764 6980 12887 7646 2814 ...
## $ Response : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 2 2 1 2 1 ...
## $ Coverage : Factor w/ 3 levels "Basic","Extended",..: 1 2 3 1 1 1 1 3 1 2 ...
## $ Education : Factor w/ 5 levels "Bachelor","College",..: 1 1 1 1 1 1 2 5 1 2 ...
## $ Effective.To.Date : Factor w/ 59 levels "1/1/11","1/10/11",..: 48 25 42 13 53 18 48 10 19 40 ...
## $ EmploymentStatus : Factor w/ 5 levels "Disabled","Employed",..: 2 5 2 5 2 2 2 5 3 2 ...
## $ Gender : Factor w/ 2 levels "F","M": 1 1 1 2 2 1 1 2 2 1 ...
## $ Income : int 56274 0 48767 0 43836 62902 55350 0 14072 28812 ...
## $ Location.Code : Factor w/ 3 levels "Rural","Suburban",..: 2 2 2 2 1 1 2 3 2 3 ...
## $ Marital.Status : Factor w/ 3 levels "Divorced","Married",..: 2 3 2 2 3 2 2 3 1 2 ...
## $ Monthly.Premium.Auto : int 69 94 108 106 73 69 67 101 71 93 ...
## $ Months.Since.Last.Claim : int 32 13 18 18 12 14 0 0 13 17 ...
## $ Months.Since.Policy.Inception: int 5 42 38 65 44 94 13 68 3 7 ...
## $ Number.of.Open.Complaints : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Number.of.Policies : int 1 8 2 7 1 2 9 4 2 8 ...
## $ Policy.Type : Factor w/ 3 levels "Corporate Auto",..: 1 2 2 1 2 2 1 1 1 3 ...
## $ Policy : Factor w/ 9 levels "Corporate L1",..: 3 6 6 2 4 6 3 3 3 8 ...
## $ Renew.Offer.Type : Factor w/ 4 levels "Offer1","Offer2",..: 1 3 1 1 1 2 1 1 1 2 ...
## $ Sales.Channel : Factor w/ 4 levels "Agent","Branch",..: 1 1 1 3 1 4 1 1 1 2 ...
## $ Total.Claim.Amount : num 385 1131 566 530 138 ...
## $ Vehicle.Class : Factor w/ 6 levels "Four-Door Car",..: 6 1 6 5 1 6 1 1 1 1 ...
## $ Vehicle.Size : Factor w/ 3 levels "Large","Medsize",..: 2 2 2 2 2 2 2 2 2 2 ...
# Column-wise preview of the same data (one row of output per column).
glimpse(bankruptcy.data)
## Observations: 9,134
## Variables: 24
## $ Customer <fct> BU79786, QZ44356, AI49188, WW632...
## $ State <fct> Washington, Arizona, Nevada, Cal...
## $ Customer.Lifetime.Value <dbl> 2763.519, 6979.536, 12887.432, 7...
## $ Response <fct> No, No, No, No, No, Yes, Yes, No...
## $ Coverage <fct> Basic, Extended, Premium, Basic,...
## $ Education <fct> Bachelor, Bachelor, Bachelor, Ba...
## $ Effective.To.Date <fct> 2/24/11, 1/31/11, 2/19/11, 1/20/...
## $ EmploymentStatus <fct> Employed, Unemployed, Employed, ...
## $ Gender <fct> F, F, F, M, M, F, F, M, M, F, M,...
## $ Income <int> 56274, 0, 48767, 0, 43836, 62902...
## $ Location.Code <fct> Suburban, Suburban, Suburban, Su...
## $ Marital.Status <fct> Married, Single, Married, Marrie...
## $ Monthly.Premium.Auto <int> 69, 94, 108, 106, 73, 69, 67, 10...
## $ Months.Since.Last.Claim <int> 32, 13, 18, 18, 12, 14, 0, 0, 13...
## $ Months.Since.Policy.Inception <int> 5, 42, 38, 65, 44, 94, 13, 68, 3...
## $ Number.of.Open.Complaints <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Number.of.Policies <int> 1, 8, 2, 7, 1, 2, 9, 4, 2, 8, 3,...
## $ Policy.Type <fct> Corporate Auto, Personal Auto, P...
## $ Policy <fct> Corporate L3, Personal L3, Perso...
## $ Renew.Offer.Type <fct> Offer1, Offer3, Offer1, Offer1, ...
## $ Sales.Channel <fct> Agent, Agent, Agent, Call Center...
## $ Total.Claim.Amount <dbl> 384.81115, 1131.46493, 566.47225...
## $ Vehicle.Class <fct> Two-Door Car, Four-Door Car, Two...
## $ Vehicle.Size <fct> Medsize, Medsize, Medsize, Medsi...
# Count the missing (NA) values in each column.
# vapply() pins the result to one integer per column, so the return type
# cannot silently change with the input the way sapply()'s can.
vapply(bankruptcy.data, function(x) sum(is.na(x)), integer(1))
## Customer State
## 0 0
## Customer.Lifetime.Value Response
## 0 0
## Coverage Education
## 0 0
## Effective.To.Date EmploymentStatus
## 0 0
## Gender Income
## 0 0
## Location.Code Marital.Status
## 0 0
## Monthly.Premium.Auto Months.Since.Last.Claim
## 0 0
## Months.Since.Policy.Inception Number.of.Open.Complaints
## 0 0
## Number.of.Policies Policy.Type
## 0 0
## Policy Renew.Offer.Type
## 0 0
## Sales.Channel Total.Claim.Amount
## 0 0
## Vehicle.Class Vehicle.Size
## 0 0
# Similarly, count the distinct values in each column.
# vapply() guarantees exactly one integer per column.
vapply(bankruptcy.data, function(x) length(unique(x)), integer(1))
## Customer State
## 9134 5
## Customer.Lifetime.Value Response
## 8041 2
## Coverage Education
## 3 5
## Effective.To.Date EmploymentStatus
## 59 5
## Gender Income
## 2 5694
## Location.Code Marital.Status
## 3 3
## Monthly.Premium.Auto Months.Since.Last.Claim
## 202 36
## Months.Since.Policy.Inception Number.of.Open.Complaints
## 100 6
## Number.of.Policies Policy.Type
## 9 3
## Policy Renew.Offer.Type
## 9 4
## Sales.Channel Total.Claim.Amount
## 4 5106
## Vehicle.Class Vehicle.Size
## 6 3
Using the missmap() function from the Amelia package, the amount of missing versus observed values per feature is visualized below. Consistent with the NA counts above, no feature in this dataset contains missing values.
library(Amelia)
## Warning: package 'Amelia' was built under R version 3.6.3
## Loading required package: Rcpp
## ##
## ## Amelia II: Multiple Imputation
## ## (Version 1.7.6, built: 2019-11-24)
## ## Copyright (C) 2005-2020 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
# Draw the missing-vs-observed map for every column, with a custom title.
missmap(bankruptcy.data, main = "Missing Values vs. Observed")
The data contains 9134 customers with information about their income, education, gender, residence and so on. Each customer owns a car, and you, as the entrepreneur, offer them 4 different car insurance policies. The target of this dataset is Response, which is either “Yes” (the customer accepted the offer) or “No” (the customer did not accept the offer).
Graphs
# Relation between numerical variables
# Keep only the numeric columns; vapply() guarantees a logical mask and
# drop = FALSE keeps a data frame even if a single column qualifies.
nums <- vapply(bankruptcy.data, is.numeric, logical(1))
bankruptcy_numeric <- bankruptcy.data[, nums, drop = FALSE]
corr <- cor(bankruptcy_numeric)
library("PerformanceAnalytics")
# chart.Correlation() takes the raw observations and computes the correlations
# itself; handing it `corr` would chart correlations between the correlation
# coefficients instead of between the variables.
chart.Correlation(bankruptcy_numeric, histogram = TRUE, pch = 19)
library(ggcorrplot)
# ggcorrplot(), in contrast, takes the precomputed correlation matrix.
ggcorrplot(corr, hc.order = TRUE, type = "lower", lab = TRUE)
# Exploratory Data Analysis
# Relation between the categorical variables and the response variable:
# each feature is cross-tabulated with Response and drawn as dodged bars.

# Gender -> Response (responses on the x axis, one bar per gender)
library(ggcorrplot)
tbl_gen <- table(
  Gender = bankruptcy.data$Gender,
  Response = bankruptcy.data$Response
)
ggplot(as.data.frame(tbl_gen), aes(x = factor(Response), y = Freq, fill = Gender)) +
  geom_col(position = "dodge")

# State -> Response
library(ggcorrplot)
tbl_State <- table(
  State = bankruptcy.data$State,
  Response = bankruptcy.data$Response
)
ggplot(as.data.frame(tbl_State), aes(x = factor(State), y = Freq, fill = Response)) +
  geom_col(position = "dodge")

# Coverage -> Response
library(ggcorrplot)
tbl_Coverage <- table(
  Coverage = bankruptcy.data$Coverage,
  Response = bankruptcy.data$Response
)
ggplot(as.data.frame(tbl_Coverage), aes(x = factor(Coverage), y = Freq, fill = Response)) +
  geom_col(position = "dodge")
# Education -> Response
library(ggcorrplot)
tbl_Education <- with(bankruptcy.data, table(Education, Response))
# Plot the Education counts built on the previous line; using any other
# feature's table here would just repeat that feature's chart.
ggplot(as.data.frame(tbl_Education), aes(factor(Education), Freq, fill = Response)) +
  geom_col(position = "dodge")
# EmploymentStatus -> Response: dodged bars of response counts per status.
library(ggcorrplot)
tbl_EmploymentStatus <- table(
  EmploymentStatus = bankruptcy.data$EmploymentStatus,
  Response = bankruptcy.data$Response
)
ggplot(
  as.data.frame(tbl_EmploymentStatus),
  aes(x = factor(EmploymentStatus), y = Freq, fill = Response)
) +
  geom_col(position = "dodge")
# Income -> Response
library(ggcorrplot)
# The cross-tabulation is kept so the tbl_Income object stays available.
tbl_Income <- with(bankruptcy.data, table(Income, Response))
# Income is numeric with thousands of distinct values, so using it as a fill
# category produces an unreadable chart and legend. Show its distribution per
# response with a histogram instead.
ggplot(bankruptcy.data, aes(x = Income, fill = Response)) +
  geom_histogram(position = "dodge")
# Location.Code -> Response
library(ggcorrplot)
tbl_LocationCode <- table(
  Location.Code = bankruptcy.data$Location.Code,
  Response = bankruptcy.data$Response
)
ggplot(
  as.data.frame(tbl_LocationCode),
  aes(x = factor(Location.Code), y = Freq, fill = Response)
) +
  geom_col(position = "dodge")

# Marital.Status -> Response
library(ggcorrplot)
tbl_MaritalStatus <- table(
  Marital.Status = bankruptcy.data$Marital.Status,
  Response = bankruptcy.data$Response
)
ggplot(
  as.data.frame(tbl_MaritalStatus),
  aes(x = factor(Marital.Status), y = Freq, fill = Response)
) +
  geom_col(position = "dodge")

# Monthly.Premium.Auto -> Response (numeric feature, so a histogram)
library(ggcorrplot)
ggplot(data = bankruptcy.data, mapping = aes(x = Monthly.Premium.Auto, fill = Response)) +
  geom_histogram(position = "dodge")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Months.Since.Last.Claim -> Response: histogram split by response.
library(ggcorrplot)
ggplot(data = bankruptcy.data, mapping = aes(x = Months.Since.Last.Claim, fill = Response)) +
  geom_histogram(position = "dodge")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Months.Since.Policy.Inception -> Response: histogram split by response.
library(ggcorrplot)
ggplot(data = bankruptcy.data, mapping = aes(x = Months.Since.Policy.Inception, fill = Response)) +
  geom_histogram(position = "dodge")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Number.of.Open.Complaints -> Response: histogram split by response.
library(ggcorrplot)
ggplot(data = bankruptcy.data, mapping = aes(x = Number.of.Open.Complaints, fill = Response)) +
  geom_histogram(position = "dodge")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Number.of.Policies -> Response: histogram split by response.
library(ggcorrplot)
ggplot(data = bankruptcy.data, mapping = aes(x = Number.of.Policies, fill = Response)) +
  geom_histogram(position = "dodge")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Policy.Type -> Response
library(ggcorrplot)
tbl_PolicyType <- table(
  Policy.Type = bankruptcy.data$Policy.Type,
  Response = bankruptcy.data$Response
)
ggplot(
  as.data.frame(tbl_PolicyType),
  aes(x = factor(Policy.Type), y = Freq, fill = Response)
) +
  geom_col(position = "dodge")

# Renew.Offer.Type -> Response
library(ggcorrplot)
tbl_RenewOfferType <- table(
  Renew.Offer.Type = bankruptcy.data$Renew.Offer.Type,
  Response = bankruptcy.data$Response
)
ggplot(
  as.data.frame(tbl_RenewOfferType),
  aes(x = factor(Renew.Offer.Type), y = Freq, fill = Response)
) +
  geom_col(position = "dodge")

# Sales.Channel -> Response
library(ggcorrplot)
tbl_SalesChannel <- table(
  Sales.Channel = bankruptcy.data$Sales.Channel,
  Response = bankruptcy.data$Response
)
ggplot(
  as.data.frame(tbl_SalesChannel),
  aes(x = factor(Sales.Channel), y = Freq, fill = Response)
) +
  geom_col(position = "dodge")

# Total.Claim.Amount -> Response (numeric feature, so a histogram)
library(ggcorrplot)
ggplot(data = bankruptcy.data, mapping = aes(x = Total.Claim.Amount, fill = Response)) +
  geom_histogram(position = "dodge")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Vehicle.Class -> Response
library(ggcorrplot)
tbl_VehicleClass <- table(
  Vehicle.Class = bankruptcy.data$Vehicle.Class,
  Response = bankruptcy.data$Response
)
ggplot(
  as.data.frame(tbl_VehicleClass),
  aes(x = factor(Vehicle.Class), y = Freq, fill = Response)
) +
  geom_col(position = "dodge")

# Vehicle.Size -> Response
library(ggcorrplot)
tbl_VehicleSize <- table(
  Vehicle.Size = bankruptcy.data$Vehicle.Size,
  Response = bankruptcy.data$Response
)
ggplot(
  as.data.frame(tbl_VehicleSize),
  aes(x = factor(Vehicle.Size), y = Freq, fill = Response)
) +
  geom_col(position = "dodge")
Data Wrangling - cleaning All categorical features are well distributed, so I will keep them and encode them as numerical data. Some columns don't make sense or are not so important, e.g. Customer (because it's just a unique identifier), Policy (which duplicates Policy Type) and Effective To Date, so I will drop them. The data is imbalanced with regard to the outcome “Response”.
# Remove the identifier, the redundant Policy column and the date column by
# name, keeping every other column in its original order.
bankruptcy.data <- bankruptcy.data[
  , !(names(bankruptcy.data) %in% c("Customer", "Policy", "Effective.To.Date"))
]
str(bankruptcy.data)
## 'data.frame': 9134 obs. of 21 variables:
## $ State : Factor w/ 5 levels "Arizona","California",..: 5 1 3 2 5 4 4 1 4 4 ...
## $ Customer.Lifetime.Value : num 2764 6980 12887 7646 2814 ...
## $ Response : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 2 2 1 2 1 ...
## $ Coverage : Factor w/ 3 levels "Basic","Extended",..: 1 2 3 1 1 1 1 3 1 2 ...
## $ Education : Factor w/ 5 levels "Bachelor","College",..: 1 1 1 1 1 1 2 5 1 2 ...
## $ EmploymentStatus : Factor w/ 5 levels "Disabled","Employed",..: 2 5 2 5 2 2 2 5 3 2 ...
## $ Gender : Factor w/ 2 levels "F","M": 1 1 1 2 2 1 1 2 2 1 ...
## $ Income : int 56274 0 48767 0 43836 62902 55350 0 14072 28812 ...
## $ Location.Code : Factor w/ 3 levels "Rural","Suburban",..: 2 2 2 2 1 1 2 3 2 3 ...
## $ Marital.Status : Factor w/ 3 levels "Divorced","Married",..: 2 3 2 2 3 2 2 3 1 2 ...
## $ Monthly.Premium.Auto : int 69 94 108 106 73 69 67 101 71 93 ...
## $ Months.Since.Last.Claim : int 32 13 18 18 12 14 0 0 13 17 ...
## $ Months.Since.Policy.Inception: int 5 42 38 65 44 94 13 68 3 7 ...
## $ Number.of.Open.Complaints : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Number.of.Policies : int 1 8 2 7 1 2 9 4 2 8 ...
## $ Policy.Type : Factor w/ 3 levels "Corporate Auto",..: 1 2 2 1 2 2 1 1 1 3 ...
## $ Renew.Offer.Type : Factor w/ 4 levels "Offer1","Offer2",..: 1 3 1 1 1 2 1 1 1 2 ...
## $ Sales.Channel : Factor w/ 4 levels "Agent","Branch",..: 1 1 1 3 1 4 1 1 1 2 ...
## $ Total.Claim.Amount : num 385 1131 566 530 138 ...
## $ Vehicle.Class : Factor w/ 6 levels "Four-Door Car",..: 6 1 6 5 1 6 1 1 1 1 ...
## $ Vehicle.Size : Factor w/ 3 levels "Large","Medsize",..: 2 2 2 2 2 2 2 2 2 2 ...
# Encode the categorical data as numbers
# Step 1
# encode_ordinal(): map each value of `x` to its 1-based position in `order`.
#   x     - vector (or factor) of labels to encode.
#   order - the labels in the desired order; defaults to the order in which
#           the distinct values first appear in `x`.
# Returns a numeric vector the same length as `x`. exclude = NULL keeps NA as
# a level of its own when it is listed in `order` (as the default does).
encode_ordinal <- function(x, order = unique(x)) {
  coded <- factor(x, levels = order, exclude = NULL)
  as.numeric(coded)
}
# Sanity check: cross-tabulate the original Response labels against their
# encoded values to see which code each label received.
table(bankruptcy.data[["Response"]], encode_ordinal(bankruptcy.data[["Response"]]), useNA = "ifany")
##
## 1 2
## No 7826 0
## Yes 0 1308
# Work on a copy so the original labels stay available in bankruptcy.data,
# then replace the Response factor with its numeric codes.
bankruptcy.data.new <- bankruptcy.data
bankruptcy.data.new$Response <- encode_ordinal(bankruptcy.data$Response)
head(bankruptcy.data.new, n = 6L)
## State Customer.Lifetime.Value Response Coverage Education
## 1 Washington 2763.519 1 Basic Bachelor
## 2 Arizona 6979.536 1 Extended Bachelor
## 3 Nevada 12887.432 1 Premium Bachelor
## 4 California 7645.862 1 Basic Bachelor
## 5 Washington 2813.693 1 Basic Bachelor
## 6 Oregon 8256.298 2 Basic Bachelor
## EmploymentStatus Gender Income Location.Code Marital.Status
## 1 Employed F 56274 Suburban Married
## 2 Unemployed F 0 Suburban Single
## 3 Employed F 48767 Suburban Married
## 4 Unemployed M 0 Suburban Married
## 5 Employed M 43836 Rural Single
## 6 Employed F 62902 Rural Married
## Monthly.Premium.Auto Months.Since.Last.Claim
## 1 69 32
## 2 94 13
## 3 108 18
## 4 106 18
## 5 73 12
## 6 69 14
## Months.Since.Policy.Inception Number.of.Open.Complaints
## 1 5 0
## 2 42 0
## 3 38 0
## 4 65 0
## 5 44 0
## 6 94 0
## Number.of.Policies Policy.Type Renew.Offer.Type Sales.Channel
## 1 1 Corporate Auto Offer1 Agent
## 2 8 Personal Auto Offer3 Agent
## 3 2 Personal Auto Offer1 Agent
## 4 7 Corporate Auto Offer1 Call Center
## 5 1 Personal Auto Offer1 Agent
## 6 2 Personal Auto Offer2 Web
## Total.Claim.Amount Vehicle.Class Vehicle.Size
## 1 384.8111 Two-Door Car Medsize
## 2 1131.4649 Four-Door Car Medsize
## 3 566.4722 Two-Door Car Medsize
## 4 529.8813 SUV Medsize
## 5 138.1309 Four-Door Car Medsize
## 6 159.3830 Two-Door Car Medsize
# Confirm the structure after encoding: Response should now be numeric.
str(bankruptcy.data.new)
## 'data.frame': 9134 obs. of 21 variables:
## $ State : Factor w/ 5 levels "Arizona","California",..: 5 1 3 2 5 4 4 1 4 4 ...
## $ Customer.Lifetime.Value : num 2764 6980 12887 7646 2814 ...
## $ Response : num 1 1 1 1 1 2 2 1 2 1 ...
## $ Coverage : Factor w/ 3 levels "Basic","Extended",..: 1 2 3 1 1 1 1 3 1 2 ...
## $ Education : Factor w/ 5 levels "Bachelor","College",..: 1 1 1 1 1 1 2 5 1 2 ...
## $ EmploymentStatus : Factor w/ 5 levels "Disabled","Employed",..: 2 5 2 5 2 2 2 5 3 2 ...
## $ Gender : Factor w/ 2 levels "F","M": 1 1 1 2 2 1 1 2 2 1 ...
## $ Income : int 56274 0 48767 0 43836 62902 55350 0 14072 28812 ...
## $ Location.Code : Factor w/ 3 levels "Rural","Suburban",..: 2 2 2 2 1 1 2 3 2 3 ...
## $ Marital.Status : Factor w/ 3 levels "Divorced","Married",..: 2 3 2 2 3 2 2 3 1 2 ...
## $ Monthly.Premium.Auto : int 69 94 108 106 73 69 67 101 71 93 ...
## $ Months.Since.Last.Claim : int 32 13 18 18 12 14 0 0 13 17 ...
## $ Months.Since.Policy.Inception: int 5 42 38 65 44 94 13 68 3 7 ...
## $ Number.of.Open.Complaints : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Number.of.Policies : int 1 8 2 7 1 2 9 4 2 8 ...
## $ Policy.Type : Factor w/ 3 levels "Corporate Auto",..: 1 2 2 1 2 2 1 1 1 3 ...
## $ Renew.Offer.Type : Factor w/ 4 levels "Offer1","Offer2",..: 1 3 1 1 1 2 1 1 1 2 ...
## $ Sales.Channel : Factor w/ 4 levels "Agent","Branch",..: 1 1 1 3 1 4 1 1 1 2 ...
## $ Total.Claim.Amount : num 385 1131 566 530 138 ...
## $ Vehicle.Class : Factor w/ 6 levels "Four-Door Car",..: 6 1 6 5 1 6 1 1 1 1 ...
## $ Vehicle.Size : Factor w/ 3 levels "Large","Medsize",..: 2 2 2 2 2 2 2 2 2 2 ...
# Correlation graph
# Relationship between the numeric features and the (now numeric) target.
nums_new <- vapply(bankruptcy.data.new, is.numeric, logical(1))
bankruptcy_numeric_new <- bankruptcy.data.new[, nums_new]
corrnew <- cor(bankruptcy_numeric_new)
library(ggcorrplot)
ggcorrplot(
  corrnew,
  hc.order = TRUE,
  type = "lower",
  lab = TRUE
)
library("ggplot2")
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
# Reshape the correlation matrix to long form: one row per (Var1, Var2) pair.
melted_cormat <- reshape2::melt(corrnew)
head(melted_cormat, n = 6L)
## Var1 Var2 value
## 1 Customer.Lifetime.Value Customer.Lifetime.Value 1.000000000
## 2 Response Customer.Lifetime.Value -0.008929582
## 3 Income Customer.Lifetime.Value 0.024365661
## 4 Monthly.Premium.Auto Customer.Lifetime.Value 0.396261738
## 5 Months.Since.Last.Claim Customer.Lifetime.Value 0.011516682
## 6 Months.Since.Policy.Inception Customer.Lifetime.Value 0.009418381
library(ggplot2)
# Heat map of the long-form correlations; x labels are rotated 90 degrees.
ggplot(melted_cormat, mapping = aes(x = Var1, y = Var2, fill = value)) +
  geom_tile() +
  theme(axis.text.x = element_text(angle = 90))
Model building Logistic regression
# Recode the target from {1, 2} to {0, 1} for the binomial model fitted below.
# Order matters: 1 -> 0 must run before 2 -> 1, otherwise the freshly written
# 1s would be turned into 0s as well.
bankruptcy.data.new$Response[bankruptcy.data.new$Response == 1] <- 0
bankruptcy.data.new$Response[bankruptcy.data.new$Response == 2] <- 1
# Drop the identifier-like columns if they are still present. A logical mask is
# used because `-which(...)` becomes integer(0) when nothing matches, and
# indexing columns with integer(0) selects zero columns instead of all of them.
bankruptcy.datas <- bankruptcy.data[, !(names(bankruptcy.data) %in%
  c("Customer", "Policy", "Effective.To.Date")), drop = FALSE]
# Split the data: 70% training / 30% test, seeded for reproducibility.
set.seed(13255870)
# sample() truncates the fractional size (9134 * 0.70 gives 6393 rows).
index <- sample(nrow(bankruptcy.data.new), nrow(bankruptcy.data.new) * 0.70)
bankruptcy.train <- bankruptcy.data.new[index, ]
bankruptcy.test <- bankruptcy.data.new[-index, ]
str(bankruptcy.train)
## 'data.frame': 6393 obs. of 21 variables:
## $ State : Factor w/ 5 levels "Arizona","California",..: 1 4 4 2 4 1 2 1 4 1 ...
## $ Customer.Lifetime.Value : num 4014 5511 8305 2787 8677 ...
## $ Response : num 1 0 0 0 0 0 0 0 0 0 ...
## $ Coverage : Factor w/ 3 levels "Basic","Extended",..: 2 1 2 1 1 2 3 1 1 2 ...
## $ Education : Factor w/ 5 levels "Bachelor","College",..: 3 4 2 1 4 2 2 5 2 4 ...
## $ EmploymentStatus : Factor w/ 5 levels "Disabled","Employed",..: 2 5 5 2 2 2 2 2 5 2 ...
## $ Gender : Factor w/ 2 levels "F","M": 1 1 2 2 2 2 1 1 1 2 ...
## $ Income : int 37384 0 0 38667 76214 25899 92850 51199 0 53603 ...
## $ Location.Code : Factor w/ 3 levels "Rural","Suburban",..: 2 2 2 1 3 2 2 2 2 2 ...
## $ Marital.Status : Factor w/ 3 levels "Divorced","Married",..: 2 3 2 3 2 1 3 1 3 2 ...
## $ Monthly.Premium.Auto : int 99 73 122 72 72 79 104 74 72 132 ...
## $ Months.Since.Last.Claim : int 9 24 22 8 7 10 3 19 15 30 ...
## $ Months.Since.Policy.Inception: int 17 57 14 67 48 11 28 52 70 1 ...
## $ Number.of.Open.Complaints : int 0 0 2 0 0 0 1 0 0 1 ...
## $ Number.of.Policies : int 1 4 9 1 2 8 2 1 2 1 ...
## $ Policy.Type : Factor w/ 3 levels "Corporate Auto",..: 2 2 1 2 3 2 2 1 2 2 ...
## $ Renew.Offer.Type : Factor w/ 4 levels "Offer1","Offer2",..: 2 1 2 1 2 4 1 1 2 1 ...
## $ Sales.Channel : Factor w/ 4 levels "Agent","Branch",..: 1 2 1 1 4 2 1 2 2 2 ...
## $ Total.Claim.Amount : num 475 526 681 159 203 ...
## $ Vehicle.Class : Factor w/ 6 levels "Four-Door Car",..: 1 1 5 1 1 6 1 1 1 4 ...
## $ Vehicle.Size : Factor w/ 3 levels "Large","Medsize",..: 2 1 3 2 2 2 3 3 2 2 ...
Model building Logistic Regression
# Logistic regression of Response on every other column of the training set.
bankruptcy.glm <- glm(
  formula = Response ~ .,
  family = binomial,
  data = bankruptcy.train
)
summary(bankruptcy.glm)
##
## Call:
## glm(formula = Response ~ ., family = binomial, data = bankruptcy.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.32576 -0.56835 -0.37340 -0.00021 3.11850
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.934e+00 5.122e-01 -3.775 0.000160 ***
## StateCalifornia 4.836e-02 1.159e-01 0.417 0.676380
## StateNevada -9.692e-03 1.605e-01 -0.060 0.951845
## StateOregon -2.025e-02 1.207e-01 -0.168 0.866821
## StateWashington -6.128e-02 1.660e-01 -0.369 0.711951
## Customer.Lifetime.Value -6.050e-06 6.443e-06 -0.939 0.347675
## CoverageExtended -6.962e-02 1.537e-01 -0.453 0.650504
## CoveragePremium -7.652e-02 3.238e-01 -0.236 0.813178
## EducationCollege 1.106e-01 1.062e-01 1.042 0.297565
## EducationDoctor 4.536e-01 2.051e-01 2.211 0.027021 *
## EducationHigh School or Below 1.951e-02 1.079e-01 0.181 0.856485
## EducationMaster 3.813e-01 1.555e-01 2.452 0.014205 *
## EmploymentStatusEmployed -2.355e-01 1.921e-01 -1.226 0.220202
## EmploymentStatusMedical Leave 1.111e-01 2.288e-01 0.486 0.627068
## EmploymentStatusRetired 2.508e+00 2.529e-01 9.916 < 2e-16 ***
## EmploymentStatusUnemployed -6.263e-01 1.992e-01 -3.144 0.001669 **
## GenderM 5.920e-02 8.173e-02 0.724 0.468852
## Income 4.149e-06 2.328e-06 1.782 0.074716 .
## Location.CodeSuburban 1.430e+00 1.789e-01 7.993 1.32e-15 ***
## Location.CodeUrban 9.295e-02 1.759e-01 0.528 0.597219
## Marital.StatusMarried -4.718e-01 1.090e-01 -4.327 1.51e-05 ***
## Marital.StatusSingle -4.882e-01 1.293e-01 -3.775 0.000160 ***
## Monthly.Premium.Auto 8.108e-03 6.248e-03 1.298 0.194373
## Months.Since.Last.Claim -4.898e-03 4.103e-03 -1.194 0.232553
## Months.Since.Policy.Inception 2.724e-04 1.450e-03 0.188 0.851032
## Number.of.Open.Complaints -5.366e-02 4.638e-02 -1.157 0.247243
## Number.of.Policies -2.426e-02 1.710e-02 -1.419 0.155943
## Policy.TypePersonal Auto 2.368e-02 1.006e-01 0.235 0.813998
## Policy.TypeSpecial Auto 3.530e-01 2.044e-01 1.727 0.084129 .
## Renew.Offer.TypeOffer2 6.859e-01 8.850e-02 7.751 9.14e-15 ***
## Renew.Offer.TypeOffer3 -2.389e+00 2.675e-01 -8.931 < 2e-16 ***
## Renew.Offer.TypeOffer4 -1.679e+01 2.318e+02 -0.072 0.942264
## Sales.ChannelBranch -5.364e-01 1.008e-01 -5.324 1.01e-07 ***
## Sales.ChannelCall Center -4.070e-01 1.143e-01 -3.562 0.000368 ***
## Sales.ChannelWeb -6.886e-01 1.388e-01 -4.960 7.06e-07 ***
## Total.Claim.Amount -1.479e-03 3.367e-04 -4.392 1.12e-05 ***
## Vehicle.ClassLuxury Car -4.072e-01 8.933e-01 -0.456 0.648550
## Vehicle.ClassLuxury SUV -3.128e-02 8.557e-01 -0.037 0.970843
## Vehicle.ClassSports Car 3.149e-01 3.162e-01 0.996 0.319193
## Vehicle.ClassSUV 2.841e-01 2.799e-01 1.015 0.310060
## Vehicle.ClassTwo-Door Car 6.233e-02 1.078e-01 0.578 0.563177
## Vehicle.SizeMedsize -2.747e-01 1.266e-01 -2.170 0.030018 *
## Vehicle.SizeSmall -6.271e-01 1.530e-01 -4.098 4.16e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 5203.1 on 6392 degrees of freedom
## Residual deviance: 4031.5 on 6350 degrees of freedom
## AIC: 4117.5
##
## Number of Fisher Scoring iterations: 17
# Model statistics: the fitted coefficient vector, named by model term.
coef(bankruptcy.glm)
## (Intercept) StateCalifornia
## -1.933768e+00 4.836226e-02
## StateNevada StateOregon
## -9.691750e-03 -2.024917e-02
## StateWashington Customer.Lifetime.Value
## -6.128259e-02 -6.050245e-06
## CoverageExtended CoveragePremium
## -6.962036e-02 -7.652196e-02
## EducationCollege EducationDoctor
## 1.106017e-01 4.536284e-01
## EducationHigh School or Below EducationMaster
## 1.951305e-02 3.813148e-01
## EmploymentStatusEmployed EmploymentStatusMedical Leave
## -2.355463e-01 1.111411e-01
## EmploymentStatusRetired EmploymentStatusUnemployed
## 2.508130e+00 -6.263096e-01
## GenderM Income
## 5.920365e-02 4.149380e-06
## Location.CodeSuburban Location.CodeUrban
## 1.430268e+00 9.294908e-02
## Marital.StatusMarried Marital.StatusSingle
## -4.718270e-01 -4.881936e-01
## Monthly.Premium.Auto Months.Since.Last.Claim
## 8.108436e-03 -4.897842e-03
## Months.Since.Policy.Inception Number.of.Open.Complaints
## 2.723917e-04 -5.366044e-02
## Number.of.Policies Policy.TypePersonal Auto
## -2.425882e-02 2.367814e-02
## Policy.TypeSpecial Auto Renew.Offer.TypeOffer2
## 3.529783e-01 6.859085e-01
## Renew.Offer.TypeOffer3 Renew.Offer.TypeOffer4
## -2.389083e+00 -1.678914e+01
## Sales.ChannelBranch Sales.ChannelCall Center
## -5.364338e-01 -4.069999e-01
## Sales.ChannelWeb Total.Claim.Amount
## -6.885759e-01 -1.478793e-03
## Vehicle.ClassLuxury Car Vehicle.ClassLuxury SUV
## -4.071515e-01 -3.127519e-02
## Vehicle.ClassSports Car Vehicle.ClassSUV
## 3.149311e-01 2.840871e-01
## Vehicle.ClassTwo-Door Car Vehicle.SizeMedsize
## 6.233181e-02 -2.747114e-01
## Vehicle.SizeSmall
## -6.271449e-01
# Full logistic regression of Response on every other column of the training
# set. NOTE: this uses the logit link, not probit as this section was
# previously labelled; use binomial(link = "probit") if a probit fit is wanted.
glm0 <- glm(
  Response ~ .,
  family = binomial(link = "logit"),
  data = bankruptcy.train
)
# Coefficient table, deviances and AIC for the fitted model
summary(glm0)
##
## Call:
## glm(formula = Response ~ ., family = binomial(link = "logit"),
## data = bankruptcy.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.32576 -0.56835 -0.37340 -0.00021 3.11850
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.934e+00 5.122e-01 -3.775 0.000160 ***
## StateCalifornia 4.836e-02 1.159e-01 0.417 0.676380
## StateNevada -9.692e-03 1.605e-01 -0.060 0.951845
## StateOregon -2.025e-02 1.207e-01 -0.168 0.866821
## StateWashington -6.128e-02 1.660e-01 -0.369 0.711951
## Customer.Lifetime.Value -6.050e-06 6.443e-06 -0.939 0.347675
## CoverageExtended -6.962e-02 1.537e-01 -0.453 0.650504
## CoveragePremium -7.652e-02 3.238e-01 -0.236 0.813178
## EducationCollege 1.106e-01 1.062e-01 1.042 0.297565
## EducationDoctor 4.536e-01 2.051e-01 2.211 0.027021 *
## EducationHigh School or Below 1.951e-02 1.079e-01 0.181 0.856485
## EducationMaster 3.813e-01 1.555e-01 2.452 0.014205 *
## EmploymentStatusEmployed -2.355e-01 1.921e-01 -1.226 0.220202
## EmploymentStatusMedical Leave 1.111e-01 2.288e-01 0.486 0.627068
## EmploymentStatusRetired 2.508e+00 2.529e-01 9.916 < 2e-16 ***
## EmploymentStatusUnemployed -6.263e-01 1.992e-01 -3.144 0.001669 **
## GenderM 5.920e-02 8.173e-02 0.724 0.468852
## Income 4.149e-06 2.328e-06 1.782 0.074716 .
## Location.CodeSuburban 1.430e+00 1.789e-01 7.993 1.32e-15 ***
## Location.CodeUrban 9.295e-02 1.759e-01 0.528 0.597219
## Marital.StatusMarried -4.718e-01 1.090e-01 -4.327 1.51e-05 ***
## Marital.StatusSingle -4.882e-01 1.293e-01 -3.775 0.000160 ***
## Monthly.Premium.Auto 8.108e-03 6.248e-03 1.298 0.194373
## Months.Since.Last.Claim -4.898e-03 4.103e-03 -1.194 0.232553
## Months.Since.Policy.Inception 2.724e-04 1.450e-03 0.188 0.851032
## Number.of.Open.Complaints -5.366e-02 4.638e-02 -1.157 0.247243
## Number.of.Policies -2.426e-02 1.710e-02 -1.419 0.155943
## Policy.TypePersonal Auto 2.368e-02 1.006e-01 0.235 0.813998
## Policy.TypeSpecial Auto 3.530e-01 2.044e-01 1.727 0.084129 .
## Renew.Offer.TypeOffer2 6.859e-01 8.850e-02 7.751 9.14e-15 ***
## Renew.Offer.TypeOffer3 -2.389e+00 2.675e-01 -8.931 < 2e-16 ***
## Renew.Offer.TypeOffer4 -1.679e+01 2.318e+02 -0.072 0.942264
## Sales.ChannelBranch -5.364e-01 1.008e-01 -5.324 1.01e-07 ***
## Sales.ChannelCall Center -4.070e-01 1.143e-01 -3.562 0.000368 ***
## Sales.ChannelWeb -6.886e-01 1.388e-01 -4.960 7.06e-07 ***
## Total.Claim.Amount -1.479e-03 3.367e-04 -4.392 1.12e-05 ***
## Vehicle.ClassLuxury Car -4.072e-01 8.933e-01 -0.456 0.648550
## Vehicle.ClassLuxury SUV -3.128e-02 8.557e-01 -0.037 0.970843
## Vehicle.ClassSports Car 3.149e-01 3.162e-01 0.996 0.319193
## Vehicle.ClassSUV 2.841e-01 2.799e-01 1.015 0.310060
## Vehicle.ClassTwo-Door Car 6.233e-02 1.078e-01 0.578 0.563177
## Vehicle.SizeMedsize -2.747e-01 1.266e-01 -2.170 0.030018 *
## Vehicle.SizeSmall -6.271e-01 1.530e-01 -4.098 4.16e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 5203.1 on 6392 degrees of freedom
## Residual deviance: 4031.5 on 6350 degrees of freedom
## AIC: 4117.5
##
## Number of Fisher Scoring iterations: 17
# In-sample evaluation of glm0: with no newdata, predict() returns the fitted
# probabilities for the rows the model was trained on.
bankruptcy_model0_insample <- predict(glm0, type = "response")
pred <- prediction(
  predictions = bankruptcy_model0_insample,
  labels = bankruptcy.train$Response
)
# ROC curve: true positive rate plotted against false positive rate
perf <- performance(pred, measure = "tpr", x.measure = "fpr")
plot(perf, colorize = TRUE)
# Area under the ROC curve (AUC)
cat(
  "AUC for full model is ",
  unlist(performance(pred, measure = "auc")@y.values)
)
## AUC for full model is 0.8170277
Testing data
# Out-of-sample evaluation of glm0 on the held-out test set.
# NOTE: despite its "_insample" suffix, the variable is overwritten here with
# predicted probabilities for bankruptcy.test.
bankruptcy_model0_insample <- predict(
  glm0,
  newdata = bankruptcy.test,
  type = "response"
)
pred <- prediction(
  predictions = bankruptcy_model0_insample,
  labels = bankruptcy.test$Response
)
# ROC curve: true positive rate plotted against false positive rate
perf <- performance(pred, measure = "tpr", x.measure = "fpr")
plot(perf, colorize = TRUE)
# Area under the ROC curve (AUC)
cat(
  "AUC for full model is ",
  unlist(performance(pred, measure = "auc")@y.values)
)
## AUC for full model is 0.8090875
Model 2: Lasso
# glmnet needs a numeric matrix, so expand every factor into dummy columns.
dummy <- model.matrix(~ ., data = bankruptcy.data.new)
# Drop the first (intercept) column; data.frame() also converts the column
# names to syntactic ones (spaces and dashes become dots).
bankruptcy_data_lasso <- data.frame(dummy[, -1])

# Predictor matrices: every column except Response, split by the row `index`
bankruptcy.train.X <- bankruptcy_data_lasso %>%
  select(-Response) %>%
  .[index, ] %>%
  as.matrix()
bankruptcy.test.X <- bankruptcy_data_lasso %>%
  select(-Response) %>%
  .[-index, ] %>%
  as.matrix()

# Response vectors for the same train/test split
bankruptcy.train.Y <- bankruptcy_data_lasso$Response[index]
bankruptcy.test.Y <- bankruptcy_data_lasso$Response[-index]

# Binomial glmnet fit over its default lambda path
bankruptcy_lasso <- glmnet(
  x = bankruptcy.train.X,
  y = bankruptcy.train.Y,
  family = "binomial"
)
# Cross-validation over lambda, scored by misclassification error
bankruptcy_lasso_cv <- cv.glmnet(
  x = bankruptcy.train.X,
  y = bankruptcy.train.Y,
  family = "binomial",
  type.measure = "class"
)
plot(bankruptcy_lasso_cv)
par(mfrow = c(1, 1))

# Coefficients at the lambda with the minimum cross-validated error
coef(bankruptcy_lasso, s = bankruptcy_lasso_cv$lambda.min)
## 43 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) -1.811631e+00
## StateCalifornia .
## StateNevada .
## StateOregon .
## StateWashington .
## Customer.Lifetime.Value .
## CoverageExtended .
## CoveragePremium .
## EducationCollege .
## EducationDoctor 1.935424e-01
## EducationHigh.School.or.Below .
## EducationMaster 1.854072e-01
## EmploymentStatusEmployed .
## EmploymentStatusMedical.Leave 4.950095e-02
## EmploymentStatusRetired 2.456598e+00
## EmploymentStatusUnemployed -5.806465e-01
## GenderM .
## Income 5.080228e-07
## Location.CodeSuburban 8.330149e-01
## Location.CodeUrban -1.098303e-01
## Marital.StatusMarried -2.341657e-01
## Marital.StatusSingle -2.912739e-01
## Monthly.Premium.Auto 5.495355e-05
## Months.Since.Last.Claim -1.644167e-03
## Months.Since.Policy.Inception .
## Number.of.Open.Complaints -5.522565e-03
## Number.of.Policies -1.047594e-02
## Policy.TypePersonal.Auto .
## Policy.TypeSpecial.Auto 1.183639e-01
## Renew.Offer.TypeOffer2 6.036038e-01
## Renew.Offer.TypeOffer3 -1.875267e+00
## Renew.Offer.TypeOffer4 -2.712024e+00
## Sales.ChannelBranch -3.516541e-01
## Sales.ChannelCall.Center -2.149748e-01
## Sales.ChannelWeb -4.546039e-01
## Total.Claim.Amount -2.705172e-04
## Vehicle.ClassLuxury.Car .
## Vehicle.ClassLuxury.SUV .
## Vehicle.ClassSports.Car 2.161568e-01
## Vehicle.ClassSUV 2.308456e-01
## Vehicle.ClassTwo.Door.Car .
## Vehicle.SizeMedsize .
## Vehicle.SizeSmall -2.841743e-01
# Coefficients at lambda.1se, the second lambda reported by cv.glmnet
coef(bankruptcy_lasso, s = bankruptcy_lasso_cv[["lambda.1se"]])
## 43 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) -1.977400124
## StateCalifornia .
## StateNevada .
## StateOregon .
## StateWashington .
## Customer.Lifetime.Value .
## CoverageExtended .
## CoveragePremium .
## EducationCollege .
## EducationDoctor .
## EducationHigh.School.or.Below .
## EducationMaster .
## EmploymentStatusEmployed .
## EmploymentStatusMedical.Leave .
## EmploymentStatusRetired 2.032512167
## EmploymentStatusUnemployed .
## GenderM .
## Income .
## Location.CodeSuburban 0.008787571
## Location.CodeUrban .
## Marital.StatusMarried .
## Marital.StatusSingle .
## Monthly.Premium.Auto .
## Months.Since.Last.Claim .
## Months.Since.Policy.Inception .
## Number.of.Open.Complaints .
## Number.of.Policies .
## Policy.TypePersonal.Auto .
## Policy.TypeSpecial.Auto .
## Renew.Offer.TypeOffer2 0.384398291
## Renew.Offer.TypeOffer3 -0.344749739
## Renew.Offer.TypeOffer4 -0.329119235
## Sales.ChannelBranch .
## Sales.ChannelCall.Center .
## Sales.ChannelWeb .
## Total.Claim.Amount .
## Vehicle.ClassLuxury.Car .
## Vehicle.ClassLuxury.SUV .
## Vehicle.ClassSports.Car .
## Vehicle.ClassSUV .
## Vehicle.ClassTwo.Door.Car .
## Vehicle.SizeMedsize .
## Vehicle.SizeSmall .
# Predicted probabilities from the lasso fit at the CV-selected lambda.min
# in-sample prediction
pred.lasso.train <- predict(
  bankruptcy_lasso,
  newx = bankruptcy.train.X,
  s = bankruptcy_lasso_cv$lambda.min,
  type = "response"
)
# out-of-sample prediction
pred.lasso.test <- predict(
  bankruptcy_lasso,
  newx = bankruptcy.test.X,
  s = bankruptcy_lasso_cv$lambda.min,
  type = "response"
)

# Training-set ROC curve for the lasso predictions
pred <- prediction(predictions = pred.lasso.train, labels = bankruptcy.train.Y)
perf <- performance(pred, measure = "tpr", x.measure = "fpr")
plot(perf, colorize = TRUE)
# Area under the ROC curve (AUC).
# NOTE: the printed label says "full model", but `pred` holds lasso predictions.
cat(
  "AUC for full model is ",
  unlist(performance(pred, measure = "auc")@y.values)
)
## AUC for full model is 0.8094227
# Test-set ROC curve for the lasso predictions
pred <- prediction(predictions = pred.lasso.test, labels = bankruptcy.test.Y)
perf <- performance(pred, measure = "tpr", x.measure = "fpr")
plot(perf, colorize = TRUE)
# Area under the ROC curve (AUC).
# NOTE: the printed label says "full model", but `pred` holds lasso predictions.
cat(
  "AUC for full model is ",
  unlist(performance(pred, measure = "auc")@y.values)
)
## AUC for full model is 0.8025042
This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this: