Multiple Logistic Regression

Claimants Insurance Dataset

Implementation

# Report the working directory; the relative data path below resolves
# against it.
getwd()
## [1] "E:/Excelr DS/R _Codes/Logistic Regression"
# Load the claimants data exactly once: read "claimants.csv" from the working
# directory, falling back to an interactive file chooser only when that file
# is missing. In a non-interactive run a missing file is a hard error rather
# than a prompt nobody can answer.
claimants_path <- "claimants.csv"
if (!file.exists(claimants_path)) {
  if (!interactive()) {
    stop("claimants.csv not found in ", getwd(), call. = FALSE)
  }
  claimants_path <- file.choose() # Choose the claimants Data set
}
claimants <- read.csv(claimants_path)
# View() opens a data viewer, so it is only useful in an interactive session.
if (interactive()) {
  View(claimants)
}
# attach() is kept so that any statement referring to columns by bare name
# (e.g. CLMSEX) keeps working; prefer `data = claimants` or `claimants$col`
# in new code.
attach(claimants)
# Per-column summary; the NA counts show which columns have missing values.
summary(claimants)
##     CASENUM         ATTORNEY          CLMSEX          CLMINSUR     
##  Min.   :    0   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.: 4177   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:1.0000  
##  Median : 8756   Median :0.0000   Median :1.0000   Median :1.0000  
##  Mean   :11202   Mean   :0.4888   Mean   :0.5587   Mean   :0.9076  
##  3rd Qu.:15702   3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:1.0000  
##  Max.   :34153   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##                                   NA's   :12       NA's   :41      
##     SEATBELT           CLMAGE           LOSS        
##  Min.   :0.00000   Min.   : 0.00   Min.   :  0.000  
##  1st Qu.:0.00000   1st Qu.: 9.00   1st Qu.:  0.400  
##  Median :0.00000   Median :30.00   Median :  1.069  
##  Mean   :0.01703   Mean   :28.41   Mean   :  3.806  
##  3rd Qu.:0.00000   3rd Qu.:43.00   3rd Qu.:  3.781  
##  Max.   :1.00000   Max.   :95.00   Max.   :173.604  
##  NA's   :48        NA's   :189
# Linear Regression
# Fit an ordinary linear model of ATTORNEY on CLMSEX, CLMINSUR, CLMAGE and
# LOSS. Columns are resolved through `data = claimants` so the fit does not
# depend on the data frame having been attach()-ed.
fit <- lm(
  ATTORNEY ~ factor(CLMSEX) + factor(CLMINSUR) + CLMAGE + LOSS,
  data = claimants
)
summary(fit)
## 
## Call:
## lm(formula = ATTORNEY ~ factor(CLMSEX) + factor(CLMINSUR) + CLMAGE + 
##     LOSS)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.5799 -0.4768 -0.1769  0.4693  1.3120 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        0.3367496  0.0521232   6.461 1.56e-10 ***
## factor(CLMSEX)1    0.0825409  0.0293292   2.814  0.00497 ** 
## factor(CLMINSUR)1  0.1326489  0.0491720   2.698  0.00709 ** 
## CLMAGE             0.0004053  0.0007154   0.567  0.57113    
## LOSS              -0.0108442  0.0013994  -7.749 2.08e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4839 on 1110 degrees of freedom
##   (225 observations deleted due to missingness)
## Multiple R-squared:  0.06478,    Adjusted R-squared:  0.06141 
## F-statistic: 19.22 on 4 and 1110 DF,  p-value: 2.652e-15
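# Linear regression cannot be employed here: ATTORNEY is a 0/1 outcome, and
# the linear fit above leaves residuals as large as 1.31 with an R-squared of
# only about 0.065.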
# Logistic Regression
# List the available columns before choosing predictors.
colnames(claimants)
## [1] "CASENUM"  "ATTORNEY" "CLMSEX"   "CLMINSUR" "SEATBELT" "CLMAGE"  
## [7] "LOSS"
# Check how CLMSEX looks when treated as a factor. The column is taken from
# the data frame explicitly instead of relying on attach().
str(as.factor(claimants$CLMSEX))
##  Factor w/ 2 levels "0","1": 1 2 1 1 1 2 1 2 2 1 ...
# Logistic regression of ATTORNEY. CLMSEX, CLMINSUR and SEATBELT enter as
# factors; CLMAGE and LOSS enter as numeric predictors.
logit <- glm(
  ATTORNEY ~ factor(CLMSEX) + factor(CLMINSUR) + factor(SEATBELT) +
    CLMAGE + LOSS,
  family = binomial,
  data = claimants
)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(logit)
## 
## Call:
## glm(formula = ATTORNEY ~ factor(CLMSEX) + factor(CLMINSUR) + 
##     factor(SEATBELT) + CLMAGE + LOSS, family = binomial, data = claimants)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.74474  -1.01055  -0.02547   0.95764   2.78320  
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)       -0.199978   0.246769  -0.810  0.41772    
## factor(CLMSEX)1    0.432996   0.135706   3.191  0.00142 ** 
## factor(CLMINSUR)1  0.602173   0.231030   2.606  0.00915 ** 
## factor(SEATBELT)1 -0.781079   0.566125  -1.380  0.16768    
## CLMAGE             0.006487   0.003324   1.952  0.05097 .  
## LOSS              -0.385044   0.034845 -11.050  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1516.1  on 1095  degrees of freedom
## Residual deviance: 1287.8  on 1090  degrees of freedom
##   (244 observations deleted due to missingness)
## AIC: 1299.8
## 
## Number of Fisher Scoring iterations: 6
library(MASS)
## Warning: package 'MASS' was built under R version 3.4.4
library(caret)
## Warning: package 'caret' was built under R version 3.4.4
## Loading required package: lattice
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.4.1
library(car)
## Warning: package 'car' was built under R version 3.4.4
## Loading required package: carData
## Warning: package 'carData' was built under R version 3.4.4
# Stepwise model selection by AIC, starting from the full logistic model.
# Each step is printed; the selected model is stored in `x`.
x <- stepAIC(object = logit)
## Start:  AIC=1299.85
## ATTORNEY ~ factor(CLMSEX) + factor(CLMINSUR) + factor(SEATBELT) + 
##     CLMAGE + LOSS
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
##                    Df Deviance    AIC
## <none>                  1287.8 1299.8
## - factor(SEATBELT)  1   1289.9 1299.9
## - CLMAGE            1   1291.7 1301.7
## - factor(CLMINSUR)  1   1294.8 1304.8
## - factor(CLMSEX)    1   1298.1 1308.1
## - LOSS              1   1494.3 1504.3
# Variance inflation factors for the predictors of the logistic model
# (multicollinearity check).
vif(mod = logit)
##   factor(CLMSEX) factor(CLMINSUR) factor(SEATBELT)           CLMAGE 
##         1.010314         1.006588         1.002524         1.027312 
##             LOSS 
##         1.028889
# Odds Ratio
# Exponentiating the fitted coefficients turns log-odds into odds ratios.
exp(coef(object = logit))
##       (Intercept)   factor(CLMSEX)1 factor(CLMINSUR)1 factor(SEATBELT)1 
##         0.8187490         1.5418701         1.8260829         0.4579119 
##            CLMAGE              LOSS 
##         1.0065085         0.6804208
# Confusion matrix table 
# Predicted probabilities for every row of `claimants`. `newdata` is passed
# by name so the call does not depend on positional matching after `type`.
prob <- predict(logit, newdata = claimants, type = "response")
# Kept as a one-column data frame, as before, so later code sees the same
# object type.
prob <- as.data.frame(prob)
final <- cbind(prob, claimants)
# Single classification cutoff shared by both tables below.
threshold <- 0.5
# Rows: predicted class (probability above the cutoff); columns: observed
# ATTORNEY. table() does not count NA entries by default, so rows without a
# predicted probability are left out.
confusion <- table(prob > threshold, claimants$ATTORNEY)
table(prob > threshold)
## 
## FALSE  TRUE 
##   505   591
confusion
##        
##           0   1
##   FALSE 380 125
##   TRUE  198 393
# Model Accuracy 
# Correctly classified cases (the diagonal) as a share of all tabulated cases.
Accuracy <- sum(diag(confusion)) / sum(confusion)
Accuracy
## [1] 0.705292
# ROC Curve 
library(ROCR)
## Warning: package 'ROCR' was built under R version 3.4.2
## Loading required package: gplots
## Warning: package 'gplots' was built under R version 3.4.1
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
# Build the ROCR prediction object from the predicted probabilities and the
# observed ATTORNEY labels.
rocrpred <- prediction(prob, claimants$ATTORNEY)
# True positive rate (y axis) against false positive rate (x axis).
rocrperf <- performance(rocrpred, measure = "tpr", x.measure = "fpr")
# TRUE is spelled out because `T` is an ordinary variable that can be
# reassigned.
plot(rocrperf, colorize = TRUE, text.adj = c(-0.2, 1.7))
# Area under the ROC curve as a single number to go with the plot.
auc <- performance(rocrpred, measure = "auc")@y.values[[1]]
auc

# The larger the area under the ROC curve, the better the logistic regression
# model separates the two classes.