Multiple Logistic Regression

Claimants Insurance Dataset

Implementation

# Print the current working directory.
getwd()

## [1] "C:/Users/amits/Desktop/sconcept/datascience training/assignment"

# Load the claimants data from a file picked in the interactive dialog.
# read.csv() receives the path returned by file.choose(), so the former
# setwd() to a hard-coded, user-specific Windows folder was dropped: it was
# not needed for the read and stops the script with an error on any machine
# where that folder does not exist.
claimants <- read.csv(file.choose())
View(claimants)
# NOTE(review): attach() is kept only because later statements refer to
# columns (e.g. ATTORNEY, CLMSEX, CLMAGE) by bare name. Prefer
# claimants$COLUMN and remove this call once those references are qualified.
attach(claimants)
summary(claimants)

##     CASENUM         ATTORNEY          CLMSEX          CLMINSUR     
##  Min.   :    0   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.: 4177   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:1.0000  
##  Median : 8756   Median :0.0000   Median :1.0000   Median :1.0000  
##  Mean   :11202   Mean   :0.4888   Mean   :0.5587   Mean   :0.9076  
##  3rd Qu.:15702   3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:1.0000  
##  Max.   :34153   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##                                   NA's   :12       NA's   :41      
##     SEATBELT           CLMAGE           LOSS        
##  Min.   :0.00000   Min.   : 0.00   Min.   :  0.000  
##  1st Qu.:0.00000   1st Qu.: 9.00   1st Qu.:  0.400  
##  Median :0.00000   Median :30.00   Median :  1.069  
##  Mean   :0.01703   Mean   :28.41   Mean   :  3.806  
##  3rd Qu.:0.00000   3rd Qu.:43.00   3rd Qu.:  3.781  
##  Max.   :1.00000   Max.   :95.00   Max.   :173.604  
##  NA's   :48        NA's   :189

logistic regression

# Keep only the rows that have no missing value in any column, then
# summarise that complete-case copy.
a <- na.omit(claimants)
summary(a)

##     CASENUM         ATTORNEY          CLMSEX          CLMINSUR     
##  Min.   :    0   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.: 4503   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:1.0000  
##  Median : 8730   Median :0.0000   Median :1.0000   Median :1.0000  
##  Mean   :11244   Mean   :0.4726   Mean   :0.5648   Mean   :0.9042  
##  3rd Qu.:16013   3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:1.0000  
##  Max.   :34153   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##     SEATBELT           CLMAGE           LOSS        
##  Min.   :0.00000   Min.   : 0.00   Min.   :  0.000  
##  1st Qu.:0.00000   1st Qu.: 9.00   1st Qu.:  0.440  
##  Median :0.00000   Median :30.00   Median :  1.311  
##  Mean   :0.01825   Mean   :28.59   Mean   :  3.857  
##  3rd Qu.:0.00000   3rd Qu.:43.00   3rd Qu.:  3.910  
##  Max.   :1.00000   Max.   :95.00   Max.   :173.604

# List the column names of the data frame.
colnames(claimants)

## [1] "CASENUM"  "ATTORNEY" "CLMSEX"   "CLMINSUR" "SEATBELT" "CLMAGE"  
## [7] "LOSS"

# Show the storage type and the first few values of every column.
str(claimants)

## 'data.frame':    1340 obs. of  7 variables:
##  $ CASENUM : int  5 3 66 70 96 97 10 36 51 55 ...
##  $ ATTORNEY: int  0 1 1 0 1 0 0 0 1 1 ...
##  $ CLMSEX  : int  0 1 0 0 0 1 0 1 1 0 ...
##  $ CLMINSUR: int  1 0 1 1 1 1 1 1 1 1 ...
##  $ SEATBELT: int  0 0 0 1 0 0 0 0 0 0 ...
##  $ CLMAGE  : int  50 18 5 31 30 35 9 34 60 NA ...
##  $ LOSS    : num  34.94 0.891 0.33 0.037 0.038 ...

# Inspect individual columns. They are read from the data frame explicitly
# so these lines do not depend on attach() having been called.
str(claimants$ATTORNEY)

##  int [1:1340] 0 1 1 0 1 0 0 0 1 1 ...

str(claimants$CLMSEX)

##  int [1:1340] 0 1 0 0 0 1 0 1 1 0 ...

# The same 0/1 column viewed as a two-level factor.
str(as.factor(claimants$CLMSEX))

##  Factor w/ 2 levels "0","1": 1 2 1 1 1 2 1 2 2 1 ...

# Discard the complete-case copy; the models below are fitted on claimants.
rm(a)

glm - generalised linear model

# Logistic regression of ATTORNEY on claimant sex, insurance status and
# seatbelt use (each entered as a factor) plus age and loss amount.
# No na.action is given, so rows with a missing value in any model variable
# are dropped from the fit.
m1 <- glm(
  ATTORNEY ~ factor(CLMSEX) + factor(CLMINSUR) + factor(SEATBELT) +
    CLMAGE + LOSS,
  family = binomial,
  data = claimants
)

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

# Alternative specification that uses the remaining columns as predictors.
# CASENUM is excluded from the `.` expansion: it appears to be a case
# identifier rather than a claim characteristic (NOTE(review): confirm),
# and `ATTORNEY ~ .` alone would fit a slope to it.
m2 <- glm(ATTORNEY ~ . - CASENUM, family = binomial, data = claimants)

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

# Estimated coefficients of m1, on the log-odds (logit) scale.
coef(m1)

##       (Intercept)   factor(CLMSEX)1 factor(CLMINSUR)1 factor(SEATBELT)1 
##      -0.199977705       0.432996012       0.602173170      -0.781078544 
##            CLMAGE              LOSS 
##       0.006487375      -0.385043853

# Coefficient table (estimates, standard errors, z tests) and deviance
# statistics for m1.
summary(m1)

## 
## Call:
## glm(formula = ATTORNEY ~ factor(CLMSEX) + factor(CLMINSUR) + 
##     factor(SEATBELT) + CLMAGE + LOSS, family = binomial, data = claimants)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.74474  -1.01055  -0.02547   0.95764   2.78320  
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)       -0.199978   0.246769  -0.810  0.41772    
## factor(CLMSEX)1    0.432996   0.135706   3.191  0.00142 ** 
## factor(CLMINSUR)1  0.602173   0.231030   2.606  0.00915 ** 
## factor(SEATBELT)1 -0.781079   0.566125  -1.380  0.16768    
## CLMAGE             0.006487   0.003324   1.952  0.05097 .  
## LOSS              -0.385044   0.034845 -11.050  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1516.1  on 1095  degrees of freedom
## Residual deviance: 1287.8  on 1090  degrees of freedom
##   (244 observations deleted due to missingness)
## AIC: 1299.8
## 
## Number of Fisher Scoring iterations: 6

# Exponentiate the log-odds coefficients: each predictor's value is its
# odds ratio, and the intercept becomes the baseline odds.
exp(coef(m1)) # odds ratios

##       (Intercept)   factor(CLMSEX)1 factor(CLMINSUR)1 factor(SEATBELT)1 
##         0.8187490         1.5418701         1.8260829         0.4579119 
##            CLMAGE              LOSS 
##         1.0065085         0.6804208

prediction

# Predicted probability of ATTORNEY == 1 for every row of claimants.
# type = "response" returns probabilities instead of log-odds. An earlier
# link-scale predict() call was removed because its result was overwritten
# by this one before being used. newdata is now passed by name.
# Rows with a missing predictor yield NA.
prob <- predict(m1, newdata = claimants, type = "response")
head(prob)

##            1            2            3            4            5 
## 2.970231e-06 5.016792e-01 5.762915e-01 4.521417e-01 6.415700e-01 
##            6 
## 7.197599e-01

# Put each fitted probability beside the row it was predicted from, open
# the combined table in the viewer, and report its dimensions.
pvprob <- as.data.frame(prob)
final <- data.frame(pvprob, claimants, check.names = FALSE)
View(final)
dim(final)

## [1] 1340    8

# Count how many rows have each value (0 / 1) of the response ATTORNEY.
# The column is read from the data frame explicitly, so the line does not
# depend on attach(); naming the argument keeps the "ATTORNEY" header in
# the printed table.
table(ATTORNEY = claimants$ATTORNEY)

## ATTORNEY
##   0   1 
## 685 655

# Count predictions on each side of the 0.5 cut-off. NA probabilities are
# left out by table()'s default settings.
table(prob > 0.5)

## 
## FALSE  TRUE 
##   505   591

# Confusion matrix at a 0.5 probability cut-off. Rows hold the predicted
# class (FALSE / TRUE) and columns the observed ATTORNEY value; both
# dimensions are now labelled so the table can be read without referring
# back to this call. ATTORNEY is read from the data frame explicitly rather
# than through attach().
confusion <- table(predicted = prob > 0.5, actual = claimants$ATTORNEY)
# Storage mode of the response column.
mode(claimants$ATTORNEY)

## [1] "numeric"

# Frequency of each recorded claimant age. The column is read from the
# data frame explicitly instead of relying on attach(); the argument name
# keeps the "CLMAGE" header in the printed table.
table(CLMAGE = claimants$CLMAGE)

## CLMAGE
##  0  1  3  4  5  6  7  8  9 10 11 13 14 15 16 17 18 19 30 31 33 34 35 36 37 
## 45 15 16 13 32 46 41 45 42 61 30 22 16 21 26 22 14 27 45 17 27 26 24 20 23 
## 38 39 40 41 43 44 45 46 47 48 49 50 51 53 54 55 56 57 58 59 60 61 63 64 65 
## 24 29 55 23 24 18 15 16 16 18 21 30  6  7 14 12  7  7  8  5 12  6  5  5  2 
## 66 67 68 69 70 71 73 74 75 76 77 78 80 83 84 86 88 95 
##  4  2  2  5 11  3  3  1  1  2  3  3  4  2  1  1  1  1

MLR_logistic_claimants

amit

9/7/2019

Multiple Logistic Regression

Claimants Insurance Dataset

Implementation

logistic regression

glm - generalised linear model

prediction