#Import all the libs
library('PerformanceAnalytics')
## Warning: package 'PerformanceAnalytics' was built under R version 3.6.3
## Loading required package: xts
## Warning: package 'xts' was built under R version 3.6.3
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 3.6.3
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## Attaching package: 'PerformanceAnalytics'
## The following object is masked from 'package:graphics':
## 
##     legend
library('ggcorrplot')
## Warning: package 'ggcorrplot' was built under R version 3.6.3
## Loading required package: ggplot2
library('ggplot2')
library('ROCR')
## Warning: package 'ROCR' was built under R version 3.6.3
## Loading required package: gplots
## Warning: package 'gplots' was built under R version 3.6.2
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:PerformanceAnalytics':
## 
##     textplot
## The following object is masked from 'package:stats':
## 
##     lowess
library('tidyverse')
## -- Attaching packages ------------------------------------------------------------------- tidyverse 1.2.1 --
## v tibble  2.1.3     v purrr   0.3.2
## v tidyr   0.8.3     v dplyr   0.8.5
## v readr   1.3.1     v stringr 1.4.0
## v tibble  2.1.3     v forcats 0.4.0
## Warning: package 'dplyr' was built under R version 3.6.3
## -- Conflicts ---------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::first()  masks xts::first()
## x dplyr::lag()    masks stats::lag()
## x dplyr::last()   masks xts::last()
library('car')
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
## The following object is masked from 'package:purrr':
## 
##     some
library("glmnet")
## Warning: package 'glmnet' was built under R version 3.6.2
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following object is masked from 'package:tidyr':
## 
##     expand
## Loaded glmnet 3.0-2
library("dplyr")
library('rpart')
library('tidyverse')
library('corrgram')
## Warning: package 'corrgram' was built under R version 3.6.2
## Registered S3 method overwritten by 'seriation':
##   method         from 
##   reorder.hclust gclus
library('glmnet')
library('boot')
## 
## Attaching package: 'boot'
## The following object is masked from 'package:car':
## 
##     logit

Load the data

# Read the marketing customer-value data set and preview the first rows.
# stringsAsFactors = TRUE is set explicitly: the summaries shown below were
# produced with factor columns, and R >= 4.0 no longer converts text columns
# to factors by default.
bankruptcy.data <- read.csv(
  "~/Assignment/WA_Fn-UseC_-Marketing-Customer-Value-Analysis.csv",
  stringsAsFactors = TRUE
)
head(bankruptcy.data)
##   Customer      State Customer.Lifetime.Value Response Coverage Education
## 1  BU79786 Washington                2763.519       No    Basic  Bachelor
## 2  QZ44356    Arizona                6979.536       No Extended  Bachelor
## 3  AI49188     Nevada               12887.432       No  Premium  Bachelor
## 4  WW63253 California                7645.862       No    Basic  Bachelor
## 5  HB64268 Washington                2813.693       No    Basic  Bachelor
## 6  OC83172     Oregon                8256.298      Yes    Basic  Bachelor
##   Effective.To.Date EmploymentStatus Gender Income Location.Code
## 1           2/24/11         Employed      F  56274      Suburban
## 2           1/31/11       Unemployed      F      0      Suburban
## 3           2/19/11         Employed      F  48767      Suburban
## 4           1/20/11       Unemployed      M      0      Suburban
## 5            2/3/11         Employed      M  43836         Rural
## 6           1/25/11         Employed      F  62902         Rural
##   Marital.Status Monthly.Premium.Auto Months.Since.Last.Claim
## 1        Married                   69                      32
## 2         Single                   94                      13
## 3        Married                  108                      18
## 4        Married                  106                      18
## 5         Single                   73                      12
## 6        Married                   69                      14
##   Months.Since.Policy.Inception Number.of.Open.Complaints
## 1                             5                         0
## 2                            42                         0
## 3                            38                         0
## 4                            65                         0
## 5                            44                         0
## 6                            94                         0
##   Number.of.Policies    Policy.Type       Policy Renew.Offer.Type
## 1                  1 Corporate Auto Corporate L3           Offer1
## 2                  8  Personal Auto  Personal L3           Offer3
## 3                  2  Personal Auto  Personal L3           Offer1
## 4                  7 Corporate Auto Corporate L2           Offer1
## 5                  1  Personal Auto  Personal L1           Offer1
## 6                  2  Personal Auto  Personal L3           Offer2
##   Sales.Channel Total.Claim.Amount Vehicle.Class Vehicle.Size
## 1         Agent           384.8111  Two-Door Car      Medsize
## 2         Agent          1131.4649 Four-Door Car      Medsize
## 3         Agent           566.4722  Two-Door Car      Medsize
## 4   Call Center           529.8813           SUV      Medsize
## 5         Agent           138.1309 Four-Door Car      Medsize
## 6           Web           159.3830  Two-Door Car      Medsize

Data Exploration: check the data type of each column

# Report the class of every column in the data frame.
sapply(bankruptcy.data, class)
##                      Customer                         State 
##                      "factor"                      "factor" 
##       Customer.Lifetime.Value                      Response 
##                     "numeric"                      "factor" 
##                      Coverage                     Education 
##                      "factor"                      "factor" 
##             Effective.To.Date              EmploymentStatus 
##                      "factor"                      "factor" 
##                        Gender                        Income 
##                      "factor"                     "integer" 
##                 Location.Code                Marital.Status 
##                      "factor"                      "factor" 
##          Monthly.Premium.Auto       Months.Since.Last.Claim 
##                     "integer"                     "integer" 
## Months.Since.Policy.Inception     Number.of.Open.Complaints 
##                     "integer"                     "integer" 
##            Number.of.Policies                   Policy.Type 
##                     "integer"                      "factor" 
##                        Policy              Renew.Offer.Type 
##                      "factor"                      "factor" 
##                 Sales.Channel            Total.Claim.Amount 
##                      "factor"                     "numeric" 
##                 Vehicle.Class                  Vehicle.Size 
##                      "factor"                      "factor"
# Per-column summary of the data frame.
summary(bankruptcy.data)
##     Customer           State      Customer.Lifetime.Value Response  
##  AA10041:   1   Arizona   :1703   Min.   : 1898           No :7826  
##  AA11235:   1   California:3150   1st Qu.: 3994           Yes:1308  
##  AA16582:   1   Nevada    : 882   Median : 5780                     
##  AA30683:   1   Oregon    :2601   Mean   : 8005                     
##  AA34092:   1   Washington: 798   3rd Qu.: 8962                     
##  AA35519:   1                     Max.   :83325                     
##  (Other):9128                                                       
##      Coverage                   Education    Effective.To.Date
##  Basic   :5568   Bachelor            :2748   1/10/11: 195     
##  Extended:2742   College             :2681   1/27/11: 194     
##  Premium : 824   Doctor              : 342   2/14/11: 186     
##                  High School or Below:2622   1/26/11: 181     
##                  Master              : 741   1/17/11: 180     
##                                              1/19/11: 179     
##                                              (Other):8019     
##       EmploymentStatus Gender       Income       Location.Code 
##  Disabled     : 405    F:4658   Min.   :    0   Rural   :1773  
##  Employed     :5698    M:4476   1st Qu.:    0   Suburban:5779  
##  Medical Leave: 432             Median :33890   Urban   :1582  
##  Retired      : 282             Mean   :37657                  
##  Unemployed   :2317             3rd Qu.:62320                  
##                                 Max.   :99981                  
##                                                                
##   Marital.Status Monthly.Premium.Auto Months.Since.Last.Claim
##  Divorced:1369   Min.   : 61.00       Min.   : 0.0           
##  Married :5298   1st Qu.: 68.00       1st Qu.: 6.0           
##  Single  :2467   Median : 83.00       Median :14.0           
##                  Mean   : 93.22       Mean   :15.1           
##                  3rd Qu.:109.00       3rd Qu.:23.0           
##                  Max.   :298.00       Max.   :35.0           
##                                                              
##  Months.Since.Policy.Inception Number.of.Open.Complaints
##  Min.   : 0.00                 Min.   :0.0000           
##  1st Qu.:24.00                 1st Qu.:0.0000           
##  Median :48.00                 Median :0.0000           
##  Mean   :48.06                 Mean   :0.3844           
##  3rd Qu.:71.00                 3rd Qu.:0.0000           
##  Max.   :99.00                 Max.   :5.0000           
##                                                         
##  Number.of.Policies         Policy.Type            Policy    
##  Min.   :1.000      Corporate Auto:1968   Personal L3 :3426  
##  1st Qu.:1.000      Personal Auto :6788   Personal L2 :2122  
##  Median :2.000      Special Auto  : 378   Personal L1 :1240  
##  Mean   :2.966                            Corporate L3:1014  
##  3rd Qu.:4.000                            Corporate L2: 595  
##  Max.   :9.000                            Corporate L1: 359  
##                                           (Other)     : 378  
##  Renew.Offer.Type     Sales.Channel  Total.Claim.Amount
##  Offer1:3752      Agent      :3477   Min.   :   0.099  
##  Offer2:2926      Branch     :2567   1st Qu.: 272.258  
##  Offer3:1432      Call Center:1765   Median : 383.945  
##  Offer4:1024      Web        :1325   Mean   : 434.089  
##                                      3rd Qu.: 547.515  
##                                      Max.   :2893.240  
##                                                        
##        Vehicle.Class   Vehicle.Size 
##  Four-Door Car:4621   Large  : 946  
##  Luxury Car   : 163   Medsize:6424  
##  Luxury SUV   : 184   Small  :1764  
##  Sports Car   : 484                 
##  SUV          :1796                 
##  Two-Door Car :1886                 
## 
# Compact display of the data frame's structure (column types and first values).
str(bankruptcy.data)
## 'data.frame':    9134 obs. of  24 variables:
##  $ Customer                     : Factor w/ 9134 levels "AA10041","AA11235",..: 601 5947 97 8017 2489 4948 8434 756 1352 548 ...
##  $ State                        : Factor w/ 5 levels "Arizona","California",..: 5 1 3 2 5 4 4 1 4 4 ...
##  $ Customer.Lifetime.Value      : num  2764 6980 12887 7646 2814 ...
##  $ Response                     : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 2 2 1 2 1 ...
##  $ Coverage                     : Factor w/ 3 levels "Basic","Extended",..: 1 2 3 1 1 1 1 3 1 2 ...
##  $ Education                    : Factor w/ 5 levels "Bachelor","College",..: 1 1 1 1 1 1 2 5 1 2 ...
##  $ Effective.To.Date            : Factor w/ 59 levels "1/1/11","1/10/11",..: 48 25 42 13 53 18 48 10 19 40 ...
##  $ EmploymentStatus             : Factor w/ 5 levels "Disabled","Employed",..: 2 5 2 5 2 2 2 5 3 2 ...
##  $ Gender                       : Factor w/ 2 levels "F","M": 1 1 1 2 2 1 1 2 2 1 ...
##  $ Income                       : int  56274 0 48767 0 43836 62902 55350 0 14072 28812 ...
##  $ Location.Code                : Factor w/ 3 levels "Rural","Suburban",..: 2 2 2 2 1 1 2 3 2 3 ...
##  $ Marital.Status               : Factor w/ 3 levels "Divorced","Married",..: 2 3 2 2 3 2 2 3 1 2 ...
##  $ Monthly.Premium.Auto         : int  69 94 108 106 73 69 67 101 71 93 ...
##  $ Months.Since.Last.Claim      : int  32 13 18 18 12 14 0 0 13 17 ...
##  $ Months.Since.Policy.Inception: int  5 42 38 65 44 94 13 68 3 7 ...
##  $ Number.of.Open.Complaints    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Number.of.Policies           : int  1 8 2 7 1 2 9 4 2 8 ...
##  $ Policy.Type                  : Factor w/ 3 levels "Corporate Auto",..: 1 2 2 1 2 2 1 1 1 3 ...
##  $ Policy                       : Factor w/ 9 levels "Corporate L1",..: 3 6 6 2 4 6 3 3 3 8 ...
##  $ Renew.Offer.Type             : Factor w/ 4 levels "Offer1","Offer2",..: 1 3 1 1 1 2 1 1 1 2 ...
##  $ Sales.Channel                : Factor w/ 4 levels "Agent","Branch",..: 1 1 1 3 1 4 1 1 1 2 ...
##  $ Total.Claim.Amount           : num  385 1131 566 530 138 ...
##  $ Vehicle.Class                : Factor w/ 6 levels "Four-Door Car",..: 6 1 6 5 1 6 1 1 1 1 ...
##  $ Vehicle.Size                 : Factor w/ 3 levels "Large","Medsize",..: 2 2 2 2 2 2 2 2 2 2 ...
# Transposed preview of the data frame: one line per column.
glimpse(bankruptcy.data)
## Observations: 9,134
## Variables: 24
## $ Customer                      <fct> BU79786, QZ44356, AI49188, WW632...
## $ State                         <fct> Washington, Arizona, Nevada, Cal...
## $ Customer.Lifetime.Value       <dbl> 2763.519, 6979.536, 12887.432, 7...
## $ Response                      <fct> No, No, No, No, No, Yes, Yes, No...
## $ Coverage                      <fct> Basic, Extended, Premium, Basic,...
## $ Education                     <fct> Bachelor, Bachelor, Bachelor, Ba...
## $ Effective.To.Date             <fct> 2/24/11, 1/31/11, 2/19/11, 1/20/...
## $ EmploymentStatus              <fct> Employed, Unemployed, Employed, ...
## $ Gender                        <fct> F, F, F, M, M, F, F, M, M, F, M,...
## $ Income                        <int> 56274, 0, 48767, 0, 43836, 62902...
## $ Location.Code                 <fct> Suburban, Suburban, Suburban, Su...
## $ Marital.Status                <fct> Married, Single, Married, Marrie...
## $ Monthly.Premium.Auto          <int> 69, 94, 108, 106, 73, 69, 67, 10...
## $ Months.Since.Last.Claim       <int> 32, 13, 18, 18, 12, 14, 0, 0, 13...
## $ Months.Since.Policy.Inception <int> 5, 42, 38, 65, 44, 94, 13, 68, 3...
## $ Number.of.Open.Complaints     <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Number.of.Policies            <int> 1, 8, 2, 7, 1, 2, 9, 4, 2, 8, 3,...
## $ Policy.Type                   <fct> Corporate Auto, Personal Auto, P...
## $ Policy                        <fct> Corporate L3, Personal L3, Perso...
## $ Renew.Offer.Type              <fct> Offer1, Offer3, Offer1, Offer1, ...
## $ Sales.Channel                 <fct> Agent, Agent, Agent, Call Center...
## $ Total.Claim.Amount            <dbl> 384.81115, 1131.46493, 566.47225...
## $ Vehicle.Class                 <fct> Two-Door Car, Four-Door Car, Two...
## $ Vehicle.Size                  <fct> Medsize, Medsize, Medsize, Medsi...
# Count the number of missing (NA) observations in each feature.
vapply(
  bankruptcy.data,
  function(column) sum(is.na(column)),
  integer(1)
)
##                      Customer                         State 
##                             0                             0 
##       Customer.Lifetime.Value                      Response 
##                             0                             0 
##                      Coverage                     Education 
##                             0                             0 
##             Effective.To.Date              EmploymentStatus 
##                             0                             0 
##                        Gender                        Income 
##                             0                             0 
##                 Location.Code                Marital.Status 
##                             0                             0 
##          Monthly.Premium.Auto       Months.Since.Last.Claim 
##                             0                             0 
## Months.Since.Policy.Inception     Number.of.Open.Complaints 
##                             0                             0 
##            Number.of.Policies                   Policy.Type 
##                             0                             0 
##                        Policy              Renew.Offer.Type 
##                             0                             0 
##                 Sales.Channel            Total.Claim.Amount 
##                             0                             0 
##                 Vehicle.Class                  Vehicle.Size 
##                             0                             0
# Likewise, count the number of distinct values in each column.
vapply(
  bankruptcy.data,
  function(column) length(unique(column)),
  integer(1)
)
##                      Customer                         State 
##                          9134                             5 
##       Customer.Lifetime.Value                      Response 
##                          8041                             2 
##                      Coverage                     Education 
##                             3                             5 
##             Effective.To.Date              EmploymentStatus 
##                            59                             5 
##                        Gender                        Income 
##                             2                          5694 
##                 Location.Code                Marital.Status 
##                             3                             3 
##          Monthly.Premium.Auto       Months.Since.Last.Claim 
##                           202                            36 
## Months.Since.Policy.Inception     Number.of.Open.Complaints 
##                           100                             6 
##            Number.of.Policies                   Policy.Type 
##                             9                             3 
##                        Policy              Renew.Offer.Type 
##                             9                             4 
##                 Sales.Channel            Total.Claim.Amount 
##                             4                          5106 
##                 Vehicle.Class                  Vehicle.Size 
##                             6                             3

Using the missmap() function from the Amelia package, the amount of missing and observed values per feature is visualized below. Consistent with the per-column NA counts above, no feature in this dataset has missing values.

library(Amelia)
## Warning: package 'Amelia' was built under R version 3.6.3
## Loading required package: Rcpp
## ## 
## ## Amelia II: Multiple Imputation
## ## (Version 1.7.6, built: 2019-11-24)
## ## Copyright (C) 2005-2020 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
# Draw the missingness map of the data frame with a custom plot title.
missmap(bankruptcy.data, main = "Missing Values vs. Observed")

The data contains 9,134 customers with information about their income, education, gender, residence and so on. Each customer owns a car, and you, as the entrepreneur, offer four different car insurance policies to them. The target of this dataset is Response, which can be “Yes” (the customer accepted the offer) or “No” (the customer didn't accept the offer).

Graphs

# Relation between numerical variables
# Keep only the numeric columns and compute their pairwise correlations.
nums <- vapply(bankruptcy.data, is.numeric, logical(1))
bankruptcy_numeric <- bankruptcy.data[, nums]
corr <- cor(bankruptcy_numeric)

library("PerformanceAnalytics")
# chart.Correlation() computes the correlations itself, so it must be given
# the raw numeric data. Passing the already-computed correlation matrix would
# plot correlations of correlations.
chart.Correlation(bankruptcy_numeric, histogram = TRUE, pch = 19)

library(ggcorrplot)
# Lower-triangle correlation heat map, clustered and labelled with the values.
ggcorrplot(corr, hc.order = TRUE, type = "lower", lab = TRUE)

# Exploratory Data Analysis
# Relation between categorical variables and the response variable

# Gender -> Response: customer counts per response, one bar per gender
library(ggcorrplot)
tbl_gen <- table(
  Gender = bankruptcy.data$Gender,
  Response = bankruptcy.data$Response
)
ggplot(
  as.data.frame(tbl_gen),
  aes(x = factor(Response), y = Freq, fill = Gender)
) +
  geom_col(position = "dodge")

# State -> Response: customer counts per state, one bar per response
library(ggcorrplot)
tbl_State <- table(
  State = bankruptcy.data$State,
  Response = bankruptcy.data$Response
)
ggplot(
  as.data.frame(tbl_State),
  aes(x = factor(State), y = Freq, fill = Response)
) +
  geom_col(position = "dodge")

# Coverage -> Response: customer counts per coverage level, one bar per response
library(ggcorrplot)
tbl_Coverage <- table(
  Coverage = bankruptcy.data$Coverage,
  Response = bankruptcy.data$Response
)
ggplot(
  as.data.frame(tbl_Coverage),
  aes(x = factor(Coverage), y = Freq, fill = Response)
) +
  geom_col(position = "dodge")

# Education -> Response: customer counts per education level, one bar per
# response. The plot uses the Education table built here (the earlier version
# plotted the Coverage table again by mistake).
library(ggcorrplot)
tbl_Education <- with(bankruptcy.data, table(Education, Response))
ggplot(
  as.data.frame(tbl_Education),
  aes(factor(Education), Freq, fill = Response)
) +
  geom_col(position = "dodge")

# EmploymentStatus -> Response: customer counts per employment status,
# one bar per response
library(ggcorrplot)
tbl_EmploymentStatus <- table(
  EmploymentStatus = bankruptcy.data$EmploymentStatus,
  Response = bankruptcy.data$Response
)
ggplot(
  as.data.frame(tbl_EmploymentStatus),
  aes(x = factor(EmploymentStatus), y = Freq, fill = Response)
) +
  geom_col(position = "dodge")

# Income -> Response
library(ggcorrplot)
# The cross-tabulation is kept so that `tbl_Income` still exists for any later
# use, but it is not plotted.
tbl_Income <- with(bankruptcy.data, table(Income, Response))
# Income is continuous, so using every distinct value as a fill level gives an
# unreadable chart. Plot a histogram split by Response instead, like the other
# numeric features.
ggplot(bankruptcy.data, aes(x = Income, fill = Response)) +
  geom_histogram(position = "dodge")

# Location.Code -> Response: customer counts per location code,
# one bar per response
library(ggcorrplot)
tbl_LocationCode <- table(
  Location.Code = bankruptcy.data$Location.Code,
  Response = bankruptcy.data$Response
)
ggplot(
  as.data.frame(tbl_LocationCode),
  aes(x = factor(Location.Code), y = Freq, fill = Response)
) +
  geom_col(position = "dodge")

# Marital.Status -> Response: customer counts per marital status,
# one bar per response
library(ggcorrplot)
tbl_MaritalStatus <- table(
  Marital.Status = bankruptcy.data$Marital.Status,
  Response = bankruptcy.data$Response
)
ggplot(
  as.data.frame(tbl_MaritalStatus),
  aes(x = factor(Marital.Status), y = Freq, fill = Response)
) +
  geom_col(position = "dodge")

# Monthly.Premium.Auto -> Response: histogram split by response
library(ggcorrplot)
ggplot(
  data = bankruptcy.data,
  mapping = aes(x = Monthly.Premium.Auto, fill = Response)
) +
  geom_histogram(position = "dodge")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Months.Since.Last.Claim -> Response: histogram split by response
library(ggcorrplot)
ggplot(
  data = bankruptcy.data,
  mapping = aes(x = Months.Since.Last.Claim, fill = Response)
) +
  geom_histogram(position = "dodge")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Months.Since.Policy.Inception -> Response: histogram split by response
library(ggcorrplot)
ggplot(
  data = bankruptcy.data,
  mapping = aes(x = Months.Since.Policy.Inception, fill = Response)
) +
  geom_histogram(position = "dodge")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Number.of.Open.Complaints -> Response: histogram split by response
library(ggcorrplot)
ggplot(
  data = bankruptcy.data,
  mapping = aes(x = Number.of.Open.Complaints, fill = Response)
) +
  geom_histogram(position = "dodge")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Number.of.Policies -> Response: histogram split by response
library(ggcorrplot)
ggplot(
  data = bankruptcy.data,
  mapping = aes(x = Number.of.Policies, fill = Response)
) +
  geom_histogram(position = "dodge")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Policy.Type -> Response: customer counts per policy type, one bar per response
library(ggcorrplot)
tbl_PolicyType <- table(
  Policy.Type = bankruptcy.data$Policy.Type,
  Response = bankruptcy.data$Response
)
ggplot(
  as.data.frame(tbl_PolicyType),
  aes(x = factor(Policy.Type), y = Freq, fill = Response)
) +
  geom_col(position = "dodge")

# Renew.Offer.Type -> Response: customer counts per renewal offer,
# one bar per response
library(ggcorrplot)
tbl_RenewOfferType <- table(
  Renew.Offer.Type = bankruptcy.data$Renew.Offer.Type,
  Response = bankruptcy.data$Response
)
ggplot(
  as.data.frame(tbl_RenewOfferType),
  aes(x = factor(Renew.Offer.Type), y = Freq, fill = Response)
) +
  geom_col(position = "dodge")

# Sales.Channel -> Response: customer counts per sales channel,
# one bar per response
library(ggcorrplot)
tbl_SalesChannel <- table(
  Sales.Channel = bankruptcy.data$Sales.Channel,
  Response = bankruptcy.data$Response
)
ggplot(
  as.data.frame(tbl_SalesChannel),
  aes(x = factor(Sales.Channel), y = Freq, fill = Response)
) +
  geom_col(position = "dodge")

# Total.Claim.Amount -> Response: histogram split by response
library(ggcorrplot)
ggplot(
  data = bankruptcy.data,
  mapping = aes(x = Total.Claim.Amount, fill = Response)
) +
  geom_histogram(position = "dodge")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Vehicle.Class -> Response: customer counts per vehicle class,
# one bar per response
library(ggcorrplot)
tbl_VehicleClass <- table(
  Vehicle.Class = bankruptcy.data$Vehicle.Class,
  Response = bankruptcy.data$Response
)
ggplot(
  as.data.frame(tbl_VehicleClass),
  aes(x = factor(Vehicle.Class), y = Freq, fill = Response)
) +
  geom_col(position = "dodge")

# Vehicle.Size -> Response: customer counts per vehicle size,
# one bar per response
library(ggcorrplot)
tbl_VehicleSize <- table(
  Vehicle.Size = bankruptcy.data$Vehicle.Size,
  Response = bankruptcy.data$Response
)
ggplot(
  as.data.frame(tbl_VehicleSize),
  aes(x = factor(Vehicle.Size), y = Freq, fill = Response)
) +
  geom_col(position = "dodge")

Data Wrangling - cleaning: All categorical features are well distributed, so I will keep them and encode them as numerical data. Some columns don't make sense or are not so important, e.g. Customer (because it's just a unique identifier), Policy (essentially the same as Policy Type) and Effective To Date, so I will drop them. The data is imbalanced with regard to the outcome “Response”.

# Remove the Customer, Policy and Effective.To.Date columns, then inspect the
# structure of what is left.
bankruptcy.data <- subset(
  bankruptcy.data,
  select = -c(Customer, Policy, Effective.To.Date)
)
str(bankruptcy.data)
## 'data.frame':    9134 obs. of  21 variables:
##  $ State                        : Factor w/ 5 levels "Arizona","California",..: 5 1 3 2 5 4 4 1 4 4 ...
##  $ Customer.Lifetime.Value      : num  2764 6980 12887 7646 2814 ...
##  $ Response                     : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 2 2 1 2 1 ...
##  $ Coverage                     : Factor w/ 3 levels "Basic","Extended",..: 1 2 3 1 1 1 1 3 1 2 ...
##  $ Education                    : Factor w/ 5 levels "Bachelor","College",..: 1 1 1 1 1 1 2 5 1 2 ...
##  $ EmploymentStatus             : Factor w/ 5 levels "Disabled","Employed",..: 2 5 2 5 2 2 2 5 3 2 ...
##  $ Gender                       : Factor w/ 2 levels "F","M": 1 1 1 2 2 1 1 2 2 1 ...
##  $ Income                       : int  56274 0 48767 0 43836 62902 55350 0 14072 28812 ...
##  $ Location.Code                : Factor w/ 3 levels "Rural","Suburban",..: 2 2 2 2 1 1 2 3 2 3 ...
##  $ Marital.Status               : Factor w/ 3 levels "Divorced","Married",..: 2 3 2 2 3 2 2 3 1 2 ...
##  $ Monthly.Premium.Auto         : int  69 94 108 106 73 69 67 101 71 93 ...
##  $ Months.Since.Last.Claim      : int  32 13 18 18 12 14 0 0 13 17 ...
##  $ Months.Since.Policy.Inception: int  5 42 38 65 44 94 13 68 3 7 ...
##  $ Number.of.Open.Complaints    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Number.of.Policies           : int  1 8 2 7 1 2 9 4 2 8 ...
##  $ Policy.Type                  : Factor w/ 3 levels "Corporate Auto",..: 1 2 2 1 2 2 1 1 1 3 ...
##  $ Renew.Offer.Type             : Factor w/ 4 levels "Offer1","Offer2",..: 1 3 1 1 1 2 1 1 1 2 ...
##  $ Sales.Channel                : Factor w/ 4 levels "Agent","Branch",..: 1 1 1 3 1 4 1 1 1 2 ...
##  $ Total.Claim.Amount           : num  385 1131 566 530 138 ...
##  $ Vehicle.Class                : Factor w/ 6 levels "Four-Door Car",..: 6 1 6 5 1 6 1 1 1 1 ...
##  $ Vehicle.Size                 : Factor w/ 3 levels "Large","Medsize",..: 2 2 2 2 2 2 2 2 2 2 ...
# Encode the categorical data as numbers
# Step 1
#
# encode_ordinal(x, order): replace each value of `x` with the 1-based position
# of that value in `order`, returned as a numeric vector.
#   x     - vector of labels (factor or atomic vector).
#   order - the level ordering to encode against; defaults to the values of `x`
#           in order of first appearance.
# NA is not excluded from the levels (exclude = NULL), so an NA listed in
# `order` gets its own code. Values absent from `order` come back as NA.
encode_ordinal <- function(x, order = unique(x)) {
  as_levels <- factor(x, levels = order, exclude = NULL)
  as.numeric(as_levels)
}

# Sanity check: cross-tabulate the original Response labels against their
# numeric codes, showing NA only if any are present.
table(
  bankruptcy.data$Response,
  encode_ordinal(bankruptcy.data$Response),
  useNA = "ifany"
)
##      
##          1    2
##   No  7826    0
##   Yes    0 1308
# Work on a copy so the original labels stay available, replace Response with
# its numeric codes, and preview the result.
bankruptcy.data.new <- bankruptcy.data
bankruptcy.data.new$Response <- encode_ordinal(bankruptcy.data$Response)
head(bankruptcy.data.new)
##        State Customer.Lifetime.Value Response Coverage Education
## 1 Washington                2763.519        1    Basic  Bachelor
## 2    Arizona                6979.536        1 Extended  Bachelor
## 3     Nevada               12887.432        1  Premium  Bachelor
## 4 California                7645.862        1    Basic  Bachelor
## 5 Washington                2813.693        1    Basic  Bachelor
## 6     Oregon                8256.298        2    Basic  Bachelor
##   EmploymentStatus Gender Income Location.Code Marital.Status
## 1         Employed      F  56274      Suburban        Married
## 2       Unemployed      F      0      Suburban         Single
## 3         Employed      F  48767      Suburban        Married
## 4       Unemployed      M      0      Suburban        Married
## 5         Employed      M  43836         Rural         Single
## 6         Employed      F  62902         Rural        Married
##   Monthly.Premium.Auto Months.Since.Last.Claim
## 1                   69                      32
## 2                   94                      13
## 3                  108                      18
## 4                  106                      18
## 5                   73                      12
## 6                   69                      14
##   Months.Since.Policy.Inception Number.of.Open.Complaints
## 1                             5                         0
## 2                            42                         0
## 3                            38                         0
## 4                            65                         0
## 5                            44                         0
## 6                            94                         0
##   Number.of.Policies    Policy.Type Renew.Offer.Type Sales.Channel
## 1                  1 Corporate Auto           Offer1         Agent
## 2                  8  Personal Auto           Offer3         Agent
## 3                  2  Personal Auto           Offer1         Agent
## 4                  7 Corporate Auto           Offer1   Call Center
## 5                  1  Personal Auto           Offer1         Agent
## 6                  2  Personal Auto           Offer2           Web
##   Total.Claim.Amount Vehicle.Class Vehicle.Size
## 1           384.8111  Two-Door Car      Medsize
## 2          1131.4649 Four-Door Car      Medsize
## 3           566.4722  Two-Door Car      Medsize
## 4           529.8813           SUV      Medsize
## 5           138.1309 Four-Door Car      Medsize
## 6           159.3830  Two-Door Car      Medsize
# Inspect the structure of the encoded copy.
str(bankruptcy.data.new)
## 'data.frame':    9134 obs. of  21 variables:
##  $ State                        : Factor w/ 5 levels "Arizona","California",..: 5 1 3 2 5 4 4 1 4 4 ...
##  $ Customer.Lifetime.Value      : num  2764 6980 12887 7646 2814 ...
##  $ Response                     : num  1 1 1 1 1 2 2 1 2 1 ...
##  $ Coverage                     : Factor w/ 3 levels "Basic","Extended",..: 1 2 3 1 1 1 1 3 1 2 ...
##  $ Education                    : Factor w/ 5 levels "Bachelor","College",..: 1 1 1 1 1 1 2 5 1 2 ...
##  $ EmploymentStatus             : Factor w/ 5 levels "Disabled","Employed",..: 2 5 2 5 2 2 2 5 3 2 ...
##  $ Gender                       : Factor w/ 2 levels "F","M": 1 1 1 2 2 1 1 2 2 1 ...
##  $ Income                       : int  56274 0 48767 0 43836 62902 55350 0 14072 28812 ...
##  $ Location.Code                : Factor w/ 3 levels "Rural","Suburban",..: 2 2 2 2 1 1 2 3 2 3 ...
##  $ Marital.Status               : Factor w/ 3 levels "Divorced","Married",..: 2 3 2 2 3 2 2 3 1 2 ...
##  $ Monthly.Premium.Auto         : int  69 94 108 106 73 69 67 101 71 93 ...
##  $ Months.Since.Last.Claim      : int  32 13 18 18 12 14 0 0 13 17 ...
##  $ Months.Since.Policy.Inception: int  5 42 38 65 44 94 13 68 3 7 ...
##  $ Number.of.Open.Complaints    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Number.of.Policies           : int  1 8 2 7 1 2 9 4 2 8 ...
##  $ Policy.Type                  : Factor w/ 3 levels "Corporate Auto",..: 1 2 2 1 2 2 1 1 1 3 ...
##  $ Renew.Offer.Type             : Factor w/ 4 levels "Offer1","Offer2",..: 1 3 1 1 1 2 1 1 1 2 ...
##  $ Sales.Channel                : Factor w/ 4 levels "Agent","Branch",..: 1 1 1 3 1 4 1 1 1 2 ...
##  $ Total.Claim.Amount           : num  385 1131 566 530 138 ...
##  $ Vehicle.Class                : Factor w/ 6 levels "Four-Door Car",..: 6 1 6 5 1 6 1 1 1 1 ...
##  $ Vehicle.Size                 : Factor w/ 3 levels "Large","Medsize",..: 2 2 2 2 2 2 2 2 2 2 ...
# Correlation Graph

# Analyze the relationship between the feature variables and the target
# variable: select the numeric columns (now including the encoded Response)
# and compute their pairwise correlations.
nums_new <- vapply(bankruptcy.data.new, is.numeric, logical(1))
bankruptcy_numeric_new <- bankruptcy.data.new[, nums_new]
corrnew <- cor(bankruptcy_numeric_new)

library(ggcorrplot)
ggcorrplot(
  corrnew,
  hc.order = TRUE,
  type = "lower",
  lab = TRUE
)

library("ggplot2")
library(reshape2)
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
# Reshape the square correlation matrix into long format: one row per
# (Var1, Var2) pair with the coefficient stored in `value`.
melted_cormat <- reshape2::melt(corrnew)
# Show the first six rows of the long table.
head(melted_cormat, n = 6L)
##                            Var1                    Var2        value
## 1       Customer.Lifetime.Value Customer.Lifetime.Value  1.000000000
## 2                      Response Customer.Lifetime.Value -0.008929582
## 3                        Income Customer.Lifetime.Value  0.024365661
## 4          Monthly.Premium.Auto Customer.Lifetime.Value  0.396261738
## 5       Months.Since.Last.Claim Customer.Lifetime.Value  0.011516682
## 6 Months.Since.Policy.Inception Customer.Lifetime.Value  0.009418381
library(ggplot2)
# Heatmap of the long-format correlations: one tile per variable pair,
# filled by the coefficient; x-axis labels are rotated to stay readable.
ggplot(
  data = melted_cormat,
  mapping = aes(x = Var1, y = Var2, fill = value)
) +
  geom_tile() +
  theme(axis.text.x = element_text(angle = 90))

Data preparation for logistic regression

# Recode the target from the 1/2 coding to a 0/1 indicator in one pass:
# 1 -> 0, 2 -> 1; any other value (including NA) is left as it is.
bankruptcy.data.new$Response <- with(
  bankruptcy.data.new,
  ifelse(Response == 1, 0, ifelse(Response == 2, 1, Response))
)

# Copy of bankruptcy.data without the Customer, Policy and Effective.To.Date
# columns.
# A logical mask replaces `-which(...)`: if none of the names is present,
# which() returns integer(0) and `df[, -integer(0)]` selects zero columns
# instead of keeping them all. `drop = FALSE` keeps the result a data frame
# even when a single column remains.
bankruptcy.datas <- bankruptcy.data[
  , !(names(bankruptcy.data) %in% c("Customer", "Policy", "Effective.To.Date")),
  drop = FALSE
]

# Split the data: a random 70% of the rows for training, the rest for testing.
# The seed fixes the draw so the split is reproducible.
set.seed(13255870)
# floor() spells out the truncation that sample() applies to a fractional size.
index <- sample(
  nrow(bankruptcy.data.new),
  size = floor(nrow(bankruptcy.data.new) * 0.70)
)
bankruptcy.train <- bankruptcy.data.new[index, ]
bankruptcy.test <- bankruptcy.data.new[-index, ]

# Check the structure of the training partition.
str(bankruptcy.train)
## 'data.frame':    6393 obs. of  21 variables:
##  $ State                        : Factor w/ 5 levels "Arizona","California",..: 1 4 4 2 4 1 2 1 4 1 ...
##  $ Customer.Lifetime.Value      : num  4014 5511 8305 2787 8677 ...
##  $ Response                     : num  1 0 0 0 0 0 0 0 0 0 ...
##  $ Coverage                     : Factor w/ 3 levels "Basic","Extended",..: 2 1 2 1 1 2 3 1 1 2 ...
##  $ Education                    : Factor w/ 5 levels "Bachelor","College",..: 3 4 2 1 4 2 2 5 2 4 ...
##  $ EmploymentStatus             : Factor w/ 5 levels "Disabled","Employed",..: 2 5 5 2 2 2 2 2 5 2 ...
##  $ Gender                       : Factor w/ 2 levels "F","M": 1 1 2 2 2 2 1 1 1 2 ...
##  $ Income                       : int  37384 0 0 38667 76214 25899 92850 51199 0 53603 ...
##  $ Location.Code                : Factor w/ 3 levels "Rural","Suburban",..: 2 2 2 1 3 2 2 2 2 2 ...
##  $ Marital.Status               : Factor w/ 3 levels "Divorced","Married",..: 2 3 2 3 2 1 3 1 3 2 ...
##  $ Monthly.Premium.Auto         : int  99 73 122 72 72 79 104 74 72 132 ...
##  $ Months.Since.Last.Claim      : int  9 24 22 8 7 10 3 19 15 30 ...
##  $ Months.Since.Policy.Inception: int  17 57 14 67 48 11 28 52 70 1 ...
##  $ Number.of.Open.Complaints    : int  0 0 2 0 0 0 1 0 0 1 ...
##  $ Number.of.Policies           : int  1 4 9 1 2 8 2 1 2 1 ...
##  $ Policy.Type                  : Factor w/ 3 levels "Corporate Auto",..: 2 2 1 2 3 2 2 1 2 2 ...
##  $ Renew.Offer.Type             : Factor w/ 4 levels "Offer1","Offer2",..: 2 1 2 1 2 4 1 1 2 1 ...
##  $ Sales.Channel                : Factor w/ 4 levels "Agent","Branch",..: 1 2 1 1 4 2 1 2 2 2 ...
##  $ Total.Claim.Amount           : num  475 526 681 159 203 ...
##  $ Vehicle.Class                : Factor w/ 6 levels "Four-Door Car",..: 1 1 5 1 1 6 1 1 1 4 ...
##  $ Vehicle.Size                 : Factor w/ 3 levels "Large","Medsize",..: 2 1 3 2 2 2 3 3 2 2 ...

Model building Logistic Regression

# Full logistic regression: Response modelled on every other column of the
# training set, using the binomial family.
# NOTE(review): the recorded fit takes 17 Fisher scoring iterations and
# Renew.Offer.TypeOffer4 has a very large standard error, which may indicate
# (quasi-)separation for that level -- confirm.
bankruptcy.glm <- glm(
  formula = Response ~ .,
  family = binomial,
  data = bankruptcy.train
)
summary(bankruptcy.glm)
## 
## Call:
## glm(formula = Response ~ ., family = binomial, data = bankruptcy.train)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.32576  -0.56835  -0.37340  -0.00021   3.11850  
## 
## Coefficients:
##                                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                   -1.934e+00  5.122e-01  -3.775 0.000160 ***
## StateCalifornia                4.836e-02  1.159e-01   0.417 0.676380    
## StateNevada                   -9.692e-03  1.605e-01  -0.060 0.951845    
## StateOregon                   -2.025e-02  1.207e-01  -0.168 0.866821    
## StateWashington               -6.128e-02  1.660e-01  -0.369 0.711951    
## Customer.Lifetime.Value       -6.050e-06  6.443e-06  -0.939 0.347675    
## CoverageExtended              -6.962e-02  1.537e-01  -0.453 0.650504    
## CoveragePremium               -7.652e-02  3.238e-01  -0.236 0.813178    
## EducationCollege               1.106e-01  1.062e-01   1.042 0.297565    
## EducationDoctor                4.536e-01  2.051e-01   2.211 0.027021 *  
## EducationHigh School or Below  1.951e-02  1.079e-01   0.181 0.856485    
## EducationMaster                3.813e-01  1.555e-01   2.452 0.014205 *  
## EmploymentStatusEmployed      -2.355e-01  1.921e-01  -1.226 0.220202    
## EmploymentStatusMedical Leave  1.111e-01  2.288e-01   0.486 0.627068    
## EmploymentStatusRetired        2.508e+00  2.529e-01   9.916  < 2e-16 ***
## EmploymentStatusUnemployed    -6.263e-01  1.992e-01  -3.144 0.001669 ** 
## GenderM                        5.920e-02  8.173e-02   0.724 0.468852    
## Income                         4.149e-06  2.328e-06   1.782 0.074716 .  
## Location.CodeSuburban          1.430e+00  1.789e-01   7.993 1.32e-15 ***
## Location.CodeUrban             9.295e-02  1.759e-01   0.528 0.597219    
## Marital.StatusMarried         -4.718e-01  1.090e-01  -4.327 1.51e-05 ***
## Marital.StatusSingle          -4.882e-01  1.293e-01  -3.775 0.000160 ***
## Monthly.Premium.Auto           8.108e-03  6.248e-03   1.298 0.194373    
## Months.Since.Last.Claim       -4.898e-03  4.103e-03  -1.194 0.232553    
## Months.Since.Policy.Inception  2.724e-04  1.450e-03   0.188 0.851032    
## Number.of.Open.Complaints     -5.366e-02  4.638e-02  -1.157 0.247243    
## Number.of.Policies            -2.426e-02  1.710e-02  -1.419 0.155943    
## Policy.TypePersonal Auto       2.368e-02  1.006e-01   0.235 0.813998    
## Policy.TypeSpecial Auto        3.530e-01  2.044e-01   1.727 0.084129 .  
## Renew.Offer.TypeOffer2         6.859e-01  8.850e-02   7.751 9.14e-15 ***
## Renew.Offer.TypeOffer3        -2.389e+00  2.675e-01  -8.931  < 2e-16 ***
## Renew.Offer.TypeOffer4        -1.679e+01  2.318e+02  -0.072 0.942264    
## Sales.ChannelBranch           -5.364e-01  1.008e-01  -5.324 1.01e-07 ***
## Sales.ChannelCall Center      -4.070e-01  1.143e-01  -3.562 0.000368 ***
## Sales.ChannelWeb              -6.886e-01  1.388e-01  -4.960 7.06e-07 ***
## Total.Claim.Amount            -1.479e-03  3.367e-04  -4.392 1.12e-05 ***
## Vehicle.ClassLuxury Car       -4.072e-01  8.933e-01  -0.456 0.648550    
## Vehicle.ClassLuxury SUV       -3.128e-02  8.557e-01  -0.037 0.970843    
## Vehicle.ClassSports Car        3.149e-01  3.162e-01   0.996 0.319193    
## Vehicle.ClassSUV               2.841e-01  2.799e-01   1.015 0.310060    
## Vehicle.ClassTwo-Door Car      6.233e-02  1.078e-01   0.578 0.563177    
## Vehicle.SizeMedsize           -2.747e-01  1.266e-01  -2.170 0.030018 *  
## Vehicle.SizeSmall             -6.271e-01  1.530e-01  -4.098 4.16e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 5203.1  on 6392  degrees of freedom
## Residual deviance: 4031.5  on 6350  degrees of freedom
## AIC: 4117.5
## 
## Number of Fisher Scoring iterations: 17
# Model statistics: named vector of the estimated coefficients
coef(bankruptcy.glm)
##                   (Intercept)               StateCalifornia 
##                 -1.933768e+00                  4.836226e-02 
##                   StateNevada                   StateOregon 
##                 -9.691750e-03                 -2.024917e-02 
##               StateWashington       Customer.Lifetime.Value 
##                 -6.128259e-02                 -6.050245e-06 
##              CoverageExtended               CoveragePremium 
##                 -6.962036e-02                 -7.652196e-02 
##              EducationCollege               EducationDoctor 
##                  1.106017e-01                  4.536284e-01 
## EducationHigh School or Below               EducationMaster 
##                  1.951305e-02                  3.813148e-01 
##      EmploymentStatusEmployed EmploymentStatusMedical Leave 
##                 -2.355463e-01                  1.111411e-01 
##       EmploymentStatusRetired    EmploymentStatusUnemployed 
##                  2.508130e+00                 -6.263096e-01 
##                       GenderM                        Income 
##                  5.920365e-02                  4.149380e-06 
##         Location.CodeSuburban            Location.CodeUrban 
##                  1.430268e+00                  9.294908e-02 
##         Marital.StatusMarried          Marital.StatusSingle 
##                 -4.718270e-01                 -4.881936e-01 
##          Monthly.Premium.Auto       Months.Since.Last.Claim 
##                  8.108436e-03                 -4.897842e-03 
## Months.Since.Policy.Inception     Number.of.Open.Complaints 
##                  2.723917e-04                 -5.366044e-02 
##            Number.of.Policies      Policy.TypePersonal Auto 
##                 -2.425882e-02                  2.367814e-02 
##       Policy.TypeSpecial Auto        Renew.Offer.TypeOffer2 
##                  3.529783e-01                  6.859085e-01 
##        Renew.Offer.TypeOffer3        Renew.Offer.TypeOffer4 
##                 -2.389083e+00                 -1.678914e+01 
##           Sales.ChannelBranch      Sales.ChannelCall Center 
##                 -5.364338e-01                 -4.069999e-01 
##              Sales.ChannelWeb            Total.Claim.Amount 
##                 -6.885759e-01                 -1.478793e-03 
##       Vehicle.ClassLuxury Car       Vehicle.ClassLuxury SUV 
##                 -4.071515e-01                 -3.127519e-02 
##       Vehicle.ClassSports Car              Vehicle.ClassSUV 
##                  3.149311e-01                  2.840871e-01 
##     Vehicle.ClassTwo-Door Car           Vehicle.SizeMedsize 
##                  6.233181e-02                 -2.747114e-01 
##             Vehicle.SizeSmall 
##                 -6.271449e-01
#Logit model with the link written out explicitly. The link here is 'logit',
#not probit, so this is the same specification as bankruptcy.glm above
#(same formula, family and training data).
glm0<-glm(Response~.,family = binomial(link = 'logit'),data = bankruptcy.train)
summary(glm0)
## 
## Call:
## glm(formula = Response ~ ., family = binomial(link = "logit"), 
##     data = bankruptcy.train)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.32576  -0.56835  -0.37340  -0.00021   3.11850  
## 
## Coefficients:
##                                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                   -1.934e+00  5.122e-01  -3.775 0.000160 ***
## StateCalifornia                4.836e-02  1.159e-01   0.417 0.676380    
## StateNevada                   -9.692e-03  1.605e-01  -0.060 0.951845    
## StateOregon                   -2.025e-02  1.207e-01  -0.168 0.866821    
## StateWashington               -6.128e-02  1.660e-01  -0.369 0.711951    
## Customer.Lifetime.Value       -6.050e-06  6.443e-06  -0.939 0.347675    
## CoverageExtended              -6.962e-02  1.537e-01  -0.453 0.650504    
## CoveragePremium               -7.652e-02  3.238e-01  -0.236 0.813178    
## EducationCollege               1.106e-01  1.062e-01   1.042 0.297565    
## EducationDoctor                4.536e-01  2.051e-01   2.211 0.027021 *  
## EducationHigh School or Below  1.951e-02  1.079e-01   0.181 0.856485    
## EducationMaster                3.813e-01  1.555e-01   2.452 0.014205 *  
## EmploymentStatusEmployed      -2.355e-01  1.921e-01  -1.226 0.220202    
## EmploymentStatusMedical Leave  1.111e-01  2.288e-01   0.486 0.627068    
## EmploymentStatusRetired        2.508e+00  2.529e-01   9.916  < 2e-16 ***
## EmploymentStatusUnemployed    -6.263e-01  1.992e-01  -3.144 0.001669 ** 
## GenderM                        5.920e-02  8.173e-02   0.724 0.468852    
## Income                         4.149e-06  2.328e-06   1.782 0.074716 .  
## Location.CodeSuburban          1.430e+00  1.789e-01   7.993 1.32e-15 ***
## Location.CodeUrban             9.295e-02  1.759e-01   0.528 0.597219    
## Marital.StatusMarried         -4.718e-01  1.090e-01  -4.327 1.51e-05 ***
## Marital.StatusSingle          -4.882e-01  1.293e-01  -3.775 0.000160 ***
## Monthly.Premium.Auto           8.108e-03  6.248e-03   1.298 0.194373    
## Months.Since.Last.Claim       -4.898e-03  4.103e-03  -1.194 0.232553    
## Months.Since.Policy.Inception  2.724e-04  1.450e-03   0.188 0.851032    
## Number.of.Open.Complaints     -5.366e-02  4.638e-02  -1.157 0.247243    
## Number.of.Policies            -2.426e-02  1.710e-02  -1.419 0.155943    
## Policy.TypePersonal Auto       2.368e-02  1.006e-01   0.235 0.813998    
## Policy.TypeSpecial Auto        3.530e-01  2.044e-01   1.727 0.084129 .  
## Renew.Offer.TypeOffer2         6.859e-01  8.850e-02   7.751 9.14e-15 ***
## Renew.Offer.TypeOffer3        -2.389e+00  2.675e-01  -8.931  < 2e-16 ***
## Renew.Offer.TypeOffer4        -1.679e+01  2.318e+02  -0.072 0.942264    
## Sales.ChannelBranch           -5.364e-01  1.008e-01  -5.324 1.01e-07 ***
## Sales.ChannelCall Center      -4.070e-01  1.143e-01  -3.562 0.000368 ***
## Sales.ChannelWeb              -6.886e-01  1.388e-01  -4.960 7.06e-07 ***
## Total.Claim.Amount            -1.479e-03  3.367e-04  -4.392 1.12e-05 ***
## Vehicle.ClassLuxury Car       -4.072e-01  8.933e-01  -0.456 0.648550    
## Vehicle.ClassLuxury SUV       -3.128e-02  8.557e-01  -0.037 0.970843    
## Vehicle.ClassSports Car        3.149e-01  3.162e-01   0.996 0.319193    
## Vehicle.ClassSUV               2.841e-01  2.799e-01   1.015 0.310060    
## Vehicle.ClassTwo-Door Car      6.233e-02  1.078e-01   0.578 0.563177    
## Vehicle.SizeMedsize           -2.747e-01  1.266e-01  -2.170 0.030018 *  
## Vehicle.SizeSmall             -6.271e-01  1.530e-01  -4.098 4.16e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 5203.1  on 6392  degrees of freedom
## Residual deviance: 4031.5  on 6350  degrees of freedom
## AIC: 4117.5
## 
## Number of Fisher Scoring iterations: 17
# In-sample evaluation: fitted probabilities of glm0 on the training data
bankruptcy_model0_insample <- predict(glm0, type = "response")
pred <- prediction(
  predictions = bankruptcy_model0_insample,
  labels = bankruptcy.train[["Response"]]
)
# ROC curve (true positive rate against false positive rate), coloured by cutoff
perf <- performance(pred, measure = "tpr", x.measure = "fpr")
plot(perf, colorize = TRUE)

# Area under the ROC curve (AUC) on the training data
cat(
  "AUC for full model is ",
  unlist(performance(pred, measure = "auc")@y.values)
)
## AUC for full model is  0.8170277

Model evaluation on the testing data

# Out-of-sample evaluation: predicted probabilities of glm0 on the test data.
# NOTE: despite the `_insample` suffix, the variable is reused here and now
# holds the test-set predictions.
bankruptcy_model0_insample <- predict(
  glm0,
  newdata = bankruptcy.test,
  type = "response"
)
pred <- prediction(
  predictions = bankruptcy_model0_insample,
  labels = bankruptcy.test[["Response"]]
)
# ROC curve (true positive rate against false positive rate), coloured by cutoff
perf <- performance(pred, measure = "tpr", x.measure = "fpr")
plot(perf, colorize = TRUE)

# Area under the ROC curve (AUC) on the test data
cat(
  "AUC for full model is ",
  unlist(performance(pred, measure = "auc")@y.values)
)
## AUC for full model is  0.8090875

Model 2 Lasso

# Expand the factors into dummy columns; the first column of the model
# matrix (the intercept) is dropped before converting back to a data frame.
dummy <- model.matrix(~ ., data = bankruptcy.data.new)
bankruptcy_data_lasso <- data.frame(dummy[, -1])

# Predictor matrices: every column except the target, split with the same
# row index as the train/test partition.
lasso_predictors <- setdiff(names(bankruptcy_data_lasso), "Response")
bankruptcy.train.X <- as.matrix(bankruptcy_data_lasso[index, lasso_predictors])
bankruptcy.test.X <- as.matrix(bankruptcy_data_lasso[-index, lasso_predictors])

# Matching target vectors
bankruptcy.train.Y <- bankruptcy_data_lasso[["Response"]][index]
bankruptcy.test.Y <- bankruptcy_data_lasso[["Response"]][-index]


# Penalised logistic regression path on the training matrices
bankruptcy_lasso <- glmnet(
  x = bankruptcy.train.X,
  y = bankruptcy.train.Y,
  family = "binomial"
)
# Cross-validation over the penalty, scored by misclassification error
bankruptcy_lasso_cv <- cv.glmnet(
  x = bankruptcy.train.X,
  y = bankruptcy.train.Y,
  family = "binomial",
  type.measure = "class"
)
# Cross-validation curve
plot(bankruptcy_lasso_cv)

# Reset the plotting layout to a single panel
par(mfrow = c(1, 1))

# Coefficients at lambda.min, the penalty selected by cross-validation;
# a "." marks a coefficient shrunk to zero.
coef(bankruptcy_lasso, s = bankruptcy_lasso_cv[["lambda.min"]])
## 43 x 1 sparse Matrix of class "dgCMatrix"
##                                           1
## (Intercept)                   -1.811631e+00
## StateCalifornia                .           
## StateNevada                    .           
## StateOregon                    .           
## StateWashington                .           
## Customer.Lifetime.Value        .           
## CoverageExtended               .           
## CoveragePremium                .           
## EducationCollege               .           
## EducationDoctor                1.935424e-01
## EducationHigh.School.or.Below  .           
## EducationMaster                1.854072e-01
## EmploymentStatusEmployed       .           
## EmploymentStatusMedical.Leave  4.950095e-02
## EmploymentStatusRetired        2.456598e+00
## EmploymentStatusUnemployed    -5.806465e-01
## GenderM                        .           
## Income                         5.080228e-07
## Location.CodeSuburban          8.330149e-01
## Location.CodeUrban            -1.098303e-01
## Marital.StatusMarried         -2.341657e-01
## Marital.StatusSingle          -2.912739e-01
## Monthly.Premium.Auto           5.495355e-05
## Months.Since.Last.Claim       -1.644167e-03
## Months.Since.Policy.Inception  .           
## Number.of.Open.Complaints     -5.522565e-03
## Number.of.Policies            -1.047594e-02
## Policy.TypePersonal.Auto       .           
## Policy.TypeSpecial.Auto        1.183639e-01
## Renew.Offer.TypeOffer2         6.036038e-01
## Renew.Offer.TypeOffer3        -1.875267e+00
## Renew.Offer.TypeOffer4        -2.712024e+00
## Sales.ChannelBranch           -3.516541e-01
## Sales.ChannelCall.Center      -2.149748e-01
## Sales.ChannelWeb              -4.546039e-01
## Total.Claim.Amount            -2.705172e-04
## Vehicle.ClassLuxury.Car        .           
## Vehicle.ClassLuxury.SUV        .           
## Vehicle.ClassSports.Car        2.161568e-01
## Vehicle.ClassSUV               2.308456e-01
## Vehicle.ClassTwo.Door.Car      .           
## Vehicle.SizeMedsize            .           
## Vehicle.SizeSmall             -2.841743e-01
# Coefficients at lambda.1se, the more heavily penalised choice from the
# cross-validation; far fewer predictors stay non-zero than at lambda.min.
coef(bankruptcy_lasso, s = bankruptcy_lasso_cv[["lambda.1se"]])
## 43 x 1 sparse Matrix of class "dgCMatrix"
##                                          1
## (Intercept)                   -1.977400124
## StateCalifornia                .          
## StateNevada                    .          
## StateOregon                    .          
## StateWashington                .          
## Customer.Lifetime.Value        .          
## CoverageExtended               .          
## CoveragePremium                .          
## EducationCollege               .          
## EducationDoctor                .          
## EducationHigh.School.or.Below  .          
## EducationMaster                .          
## EmploymentStatusEmployed       .          
## EmploymentStatusMedical.Leave  .          
## EmploymentStatusRetired        2.032512167
## EmploymentStatusUnemployed     .          
## GenderM                        .          
## Income                         .          
## Location.CodeSuburban          0.008787571
## Location.CodeUrban             .          
## Marital.StatusMarried          .          
## Marital.StatusSingle           .          
## Monthly.Premium.Auto           .          
## Months.Since.Last.Claim        .          
## Months.Since.Policy.Inception  .          
## Number.of.Open.Complaints      .          
## Number.of.Policies             .          
## Policy.TypePersonal.Auto       .          
## Policy.TypeSpecial.Auto        .          
## Renew.Offer.TypeOffer2         0.384398291
## Renew.Offer.TypeOffer3        -0.344749739
## Renew.Offer.TypeOffer4        -0.329119235
## Sales.ChannelBranch            .          
## Sales.ChannelCall.Center       .          
## Sales.ChannelWeb               .          
## Total.Claim.Amount             .          
## Vehicle.ClassLuxury.Car        .          
## Vehicle.ClassLuxury.SUV        .          
## Vehicle.ClassSports.Car        .          
## Vehicle.ClassSUV               .          
## Vehicle.ClassTwo.Door.Car      .          
## Vehicle.SizeMedsize            .          
## Vehicle.SizeSmall              .
# In-sample prediction: probabilities on the training matrix at lambda.min
pred.lasso.train <- predict(
  bankruptcy_lasso,
  newx = bankruptcy.train.X,
  s = bankruptcy_lasso_cv[["lambda.min"]],
  type = "response"
)

# Out-of-sample prediction: probabilities on the test matrix at lambda.min
pred.lasso.test <- predict(
  bankruptcy_lasso,
  newx = bankruptcy.test.X,
  s = bankruptcy_lasso_cv[["lambda.min"]],
  type = "response"
)

# ROC curve of the lasso model on the training data, coloured by cutoff
pred <- prediction(
  predictions = pred.lasso.train,
  labels = bankruptcy.train.Y
)
perf <- performance(pred, measure = "tpr", x.measure = "fpr")
plot(perf, colorize = TRUE)

# Area under the ROC curve (AUC) on the training data
cat(
  "AUC for full model is ",
  unlist(performance(pred, measure = "auc")@y.values)
)
## AUC for full model is  0.8094227
# ROC curve of the lasso model on the test data, coloured by cutoff
pred <- prediction(
  predictions = pred.lasso.test,
  labels = bankruptcy.test.Y
)
perf <- performance(pred, measure = "tpr", x.measure = "fpr")
plot(perf, colorize = TRUE)

# Area under the ROC curve (AUC) on the test data
cat(
  "AUC for full model is ",
  unlist(performance(pred, measure = "auc")@y.values)
)
## AUC for full model is  0.8025042

R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this: