Logistic Regression is one of the most common algorithms used for classification. Also known as the Logit Model, it is a statistical model used to estimate the probability of an event occurring, given previously observed data. The dependent variable should be categorical (binary), where 1 is the event and 0 is the non-event. We will use insurance data today to classify claims as fraudulent or non-fraudulent, using factors like claim amount, claim type, gender, age and others.
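Formally, the model assumes that the log-odds of the event are linear in the predictors, so the estimated probability comes from the logistic (sigmoid) function:

$$\Pr(Y = 1 \mid X) = \frac{1}{1 + e^{-(\beta_0 + \beta_1 x_1 + \dots + \beta_k x_k)}}$$

Equivalently, $\log\frac{p}{1-p} = \beta_0 + \beta_1 x_1 + \dots + \beta_k x_k$; this log-odds term is the "logit" that gives the model its name.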

Basic Settings and Data Import

Let’s begin by loading the required libraries and importing the data set we are going to use for this model.

#set working directory
setwd("C:/Users/awani/Documents/GitHub/50daysofAnalytics/Day 4 - Logistic Regression")
options(scipen = 999)

# load required libraries
library(pscl)
library(ggplot2)
library(ROCR)
library(caret)
library(knitr)

#read data
claims = read.csv("Insurance.csv", stringsAsFactors = F)
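A quick sanity check on the import is worthwhile before going further (a minimal sketch; the expected dimensions come from the str() output in the next section):

# confirm the data loaded as expected: 4415 rows, 19 columns
dim(claims)
head(claims)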

Data Format Correction

The dependent variable needs to be a factor with levels 0 and 1.

#- data format correction
str(claims)
## 'data.frame':    4415 obs. of  19 variables:
##  $ claimid          : int  351069569 806984053 654100160 653220231 226637568 46113373 397237121 917836504 277901331 16290780 ...
##  $ claim_type       : int  3 3 5 1 5 4 1 2 4 5 ...
##  $ uninhabitable    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ claim_amount     : num  192.29 355.9 3.53 33.45 4.03 ...
##  $ fraudulent       : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ claim_days       : int  7662 9197 7351 3171 2487 5028 715 4923 10922 697 ...
##  $ coverage         : int  436 925 79 607 119 144 636 254 291 114 ...
##  $ deductible       : int  2000 1000 1000 1000 3000 500 1000 2000 1000 1000 ...
##  $ townsize         : int  1 1 1 1 3 1 2 1 1 3 ...
##  $ gender           : int  1 0 1 0 1 0 0 1 0 1 ...
##  $ age              : int  65 75 69 37 40 78 26 64 60 27 ...
##  $ edcat            : int  2 2 2 3 1 3 4 1 4 1 ...
##  $ work_ex          : int  27 26 4 9 4 39 0 20 11 3 ...
##  $ retire           : int  0 0 0 0 0 1 0 1 0 0 ...
##  $ income           : int  193 203 49 118 18 25 36 13 91 23 ...
##  $ marital          : int  0 0 0 1 0 1 0 1 0 1 ...
##  $ residents        : int  1 1 1 3 1 2 1 2 1 4 ...
##  $ primary_residence: int  1 1 1 1 1 1 1 1 1 1 ...
##  $ occupancy        : int  30 37 31 15 8 15 5 22 40 7 ...
#change categorical or ordinal variables to factors
for (i in c(2, 3, 5, 9, 10, 12, 14, 16, 18)) {
  claims[, i] = as.factor(claims[, i])
}
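To verify the conversion worked, we can check the class of each converted column (a quick sketch):

# all nine converted columns should now report "factor"
sapply(claims[, c(2, 3, 5, 9, 10, 12, 14, 16, 18)], class)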

Exploratory Data Analysis

Before we proceed any further, it is essential to perform some basic exploratory data analysis. Since the number of fraudulent claims is far smaller than the number of non-fraudulent ones, we compensate for the class imbalance by attaching a higher weight to fraudulent cases (weighting has an effect similar to over-sampling the minority class).

#- Exploratory Data Analysis

# dependent variable
kable(table(claims$fraudulent),
      col.names = c("Fraud", "Frequency"), align = 'l')
|Fraud |Frequency |
|:-----|:---------|
|0     |3952      |
|1     |463       |
# weight fraudulent claims
claims$wt = ifelse(claims$fraudulent == 1, 6,1)
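A weight of 6 brings the effective class totals much closer to balance: 463 × 6 = 2778 weighted fraud cases against 3952 non-fraud cases. The weighted totals can be confirmed directly:

# weighted class counts: 3952 (non-fraud) vs 2778 (fraud)
xtabs(wt ~ fraudulent, data = claims)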

# independent variable
# Fraud by claim amount and claim days
ggplot(claims, aes(x = claim_amount, y = claim_days, shape = fraudulent, color = fraudulent)) +
  geom_point() + ggtitle("Frauds by Claim Amount and Claim Days")

ggplot(claims, aes(x = claim_days, y = coverage, shape = fraudulent, color = fraudulent)) +
  geom_point() + ggtitle("Frauds by Coverage and Claim Days")

Data Manipulation

We now need to ensure our data is in the correct format and split it into training and validation datasets. We have used a 70-30 split here.

set.seed(123)
smp_size = floor(0.7 * nrow(claims))
train_ind = sample(seq_len(nrow(claims)), size = smp_size)

train = claims[train_ind, ]
val = claims[-train_ind, ]
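It is worth confirming that the fraud rate survived the random split, and since caret is already loaded, createDataPartition offers a stratified alternative (shown here only as a sketch, not part of the original workflow):

# fraud rate should be roughly equal in both partitions
prop.table(table(train$fraudulent))
prop.table(table(val$fraudulent))

# stratified alternative: preserves the class proportions exactly
# train_ind = createDataPartition(claims$fraudulent, p = 0.7, list = FALSE)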

Variable Importance and Model Training

# test run
logit = glm(fraudulent ~ claim_type + uninhabitable + claim_amount + claim_days +
              coverage + deductible + townsize + gender + age + edcat + work_ex + retire +
              income + marital + residents + primary_residence + occupancy,
            family = binomial(link = 'logit'), weights = wt, data = train[, 2:ncol(train)])

# Importance table
Imp = data.frame(varImp(logit, scale = F))
kable(Imp, align = 'l')
|Variable           |Overall   |
|:------------------|:---------|
|claim_type2        |0.8440359 |
|claim_type3        |3.8038264 |
|claim_type4        |0.8853922 |
|claim_type5        |7.6648722 |
|uninhabitable1     |3.2667337 |
|claim_amount       |1.5530595 |
|claim_days         |0.3763675 |
|coverage           |1.1615317 |
|deductible         |0.0674526 |
|townsize2          |2.0865672 |
|townsize3          |4.2513586 |
|gender1            |1.0803863 |
|age                |0.6572532 |
|edcat2             |1.2283635 |
|edcat3             |2.1953884 |
|edcat4             |2.6577917 |
|edcat5             |0.7869859 |
|work_ex            |2.7403930 |
|retire1            |2.6495043 |
|income             |1.9243355 |
|marital1           |0.3149944 |
|residents          |0.4462148 |
|primary_residence1 |1.1057371 |
|occupancy          |0.9933360 |
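For glm models, caret's varImp reports the absolute value of each coefficient's z statistic, so higher scores indicate stronger evidence that a variable matters. We refit the model keeping only the higher-scoring variables.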
# final regression with high importance variables
logit = glm(fraudulent ~ claim_type + uninhabitable + claim_amount +
              townsize + gender + age + edcat + work_ex + retire +
              income + occupancy,
            family = binomial(link = 'logit'), weights = wt, data = train[, 2:ncol(train)])

summary(logit)
## 
## Call:
## glm(formula = fraudulent ~ claim_type + uninhabitable + claim_amount + 
##     townsize + gender + age + edcat + work_ex + retire + income + 
##     occupancy, family = binomial(link = "logit"), data = train[, 
##     2:ncol(train)], weights = wt)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.5282  -1.1017  -0.9597  -0.8121   4.6413  
## 
## Coefficients:
##                  Estimate Std. Error z value           Pr(>|z|)    
## (Intercept)    -0.5043108  0.1563485  -3.226            0.00126 ** 
## claim_type2    -0.1012973  0.1116121  -0.908            0.36410    
## claim_type3     0.4053767  0.0979752   4.138 0.0000351042956507 ***
## claim_type4    -0.1070217  0.1427998  -0.749            0.45358    
## claim_type5     0.6483539  0.0849280   7.634 0.0000000000000227 ***
## uninhabitable1 -0.3177192  0.1040511  -3.053            0.00226 ** 
## claim_amount    0.0003061  0.0002853   1.073            0.28332    
## townsize2       0.1451192  0.0660778   2.196            0.02808 *  
## townsize3       0.3949889  0.0932168   4.237 0.0000226207192735 ***
## gender1         0.0619078  0.0611838   1.012            0.31162    
## age            -0.0022729  0.0039974  -0.569            0.56962    
## edcat2          0.1174260  0.0954105   1.231            0.21842    
## edcat3          0.2287060  0.1026945   2.227            0.02594 *  
## edcat4          0.2826407  0.1037810   2.723            0.00646 ** 
## edcat5          0.1152895  0.1435295   0.803            0.42183    
## work_ex        -0.0145412  0.0053545  -2.716            0.00661 ** 
## retire1         0.3427254  0.1291621   2.653            0.00797 ** 
## income         -0.0017046  0.0006793  -2.510            0.01209 *  
## occupancy      -0.0084301  0.0049082  -1.718            0.08588 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 6356.5  on 3089  degrees of freedom
## Residual deviance: 6120.6  on 3071  degrees of freedom
## AIC: 6158.6
## 
## Number of Fisher Scoring iterations: 5
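Since the coefficients are on the log-odds scale, exponentiating them turns them into odds ratios, which are easier to interpret (a small sketch):

# odds ratios: e.g. exp(0.6484) ≈ 1.91, i.e. claim_type5 nearly
# doubles the odds of fraud relative to the baseline claim_type1
round(exp(coef(logit)), 3)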

Model Fit Indices and Plots

#- Model Fit Indices

#predict on validation set
p=predict(logit,val,type="response")

# Model performance (accuracy vs. cutoff) curve
pred=prediction(p,val$fraudulent)
eval= performance(pred,"acc")
plot(eval)
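The same performance object can be queried for the cutoff that maximizes accuracy (a sketch using ROCR's S4 slots):

# cutoff with the highest accuracy on the validation set
best_idx = which.max(eval@y.values[[1]])
eval@x.values[[1]][best_idx]  # accuracy-maximizing cutoff
eval@y.values[[1]][best_idx]  # accuracy at that cutoff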

#roc curve
roc=performance(pred,"tpr","fpr")
plot(roc,main="ROC curve")
abline(a=0,b=1)
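The area under the ROC curve (AUC) summarizes this plot in a single number, where 0.5 is random guessing and 1 is perfect separation:

# AUC on the validation set
performance(pred, "auc")@y.values[[1]]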

# confusion matrix
p=ifelse(p < 0.5,0,1)
tab=table(factor(p),factor(val$fraudulent))
confusionMatrix(tab)
## Confusion Matrix and Statistics
## 
##    
##       0   1
##   0 972 111
##   1 211  31
##                                          
##                Accuracy : 0.757          
##                  95% CI : (0.733, 0.7799)
##     No Information Rate : 0.8928         
##     P-Value [Acc > NIR] : 1              
##                                          
##                   Kappa : 0.0305         
##  Mcnemar's Test P-Value : 0.00000003447  
##                                          
##             Sensitivity : 0.8216         
##             Specificity : 0.2183         
##          Pos Pred Value : 0.8975         
##          Neg Pred Value : 0.1281         
##              Prevalence : 0.8928         
##          Detection Rate : 0.7336         
##    Detection Prevalence : 0.8174         
##       Balanced Accuracy : 0.5200         
##                                          
##        'Positive' Class : 0              
## 
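Note that confusionMatrix takes 0 (non-fraud) as the positive class by default, so the sensitivity of 0.82 above measures how well we catch non-frauds. To score the model as a fraud detector instead, set the positive class explicitly:

# recompute the statistics with fraud (1) as the positive class
confusionMatrix(tab, positive = "1")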
#pseudo R squared
pR2(logit) 
##            llh        llhNull             G2       McFadden           r2ML 
## -3060.31386591 -3178.23255522   235.83737861     0.03710197     0.07348290 
##           r2CU 
##     0.08425228
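A McFadden R-squared of 0.037 is very low (values of roughly 0.2 to 0.4 are conventionally read as a good fit), which, together with the balanced accuracy of 0.52 above, suggests these predictors explain only a small part of what drives fraudulent claims.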