Multiple Logistic Regression
Claimants Insurance Dataset
Implementation
# Report the current working directory before it is changed below.
getwd()
## [1] "C:/Users/amits/Desktop/sconcept/datascience training/assignment"
# NOTE(review): a hard-coded, machine-specific setwd() makes the script
# non-portable. It is kept here because later steps may rely on the working
# directory; prefer project-relative paths when this script is reorganised.
setwd("C:\\Users\\amits\\Desktop\\sconcept\\datascience training\\assignment")
# Read the claimants CSV; the file is chosen interactively via file.choose().
claimants <- read.csv(file.choose())
# View() opens a data viewer and is only useful in an interactive session,
# so it is skipped when the script runs in batch mode.
if (interactive()) {
  View(claimants)
}
# NOTE(review): attach() places the columns on the search path, where they can
# be masked by same-named objects. It is kept because other statements in this
# script may refer to columns by bare name; prefer claimants$COLUMN.
attach(claimants)
# Per-column summary statistics, including NA counts.
summary(claimants)
## CASENUM ATTORNEY CLMSEX CLMINSUR
## Min. : 0 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.: 4177 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:1.0000
## Median : 8756 Median :0.0000 Median :1.0000 Median :1.0000
## Mean :11202 Mean :0.4888 Mean :0.5587 Mean :0.9076
## 3rd Qu.:15702 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :34153 Max. :1.0000 Max. :1.0000 Max. :1.0000
## NA's :12 NA's :41
## SEATBELT CLMAGE LOSS
## Min. :0.00000 Min. : 0.00 Min. : 0.000
## 1st Qu.:0.00000 1st Qu.: 9.00 1st Qu.: 0.400
## Median :0.00000 Median :30.00 Median : 1.069
## Mean :0.01703 Mean :28.41 Mean : 3.806
## 3rd Qu.:0.00000 3rd Qu.:43.00 3rd Qu.: 3.781
## Max. :1.00000 Max. :95.00 Max. :173.604
## NA's :48 NA's :189
logistic regression
# Keep only the rows of `claimants` without any missing value and summarise
# that complete-case copy.
a <- na.omit(object = claimants)
summary(object = a)
## CASENUM ATTORNEY CLMSEX CLMINSUR
## Min. : 0 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.: 4503 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:1.0000
## Median : 8730 Median :0.0000 Median :1.0000 Median :1.0000
## Mean :11244 Mean :0.4726 Mean :0.5648 Mean :0.9042
## 3rd Qu.:16013 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :34153 Max. :1.0000 Max. :1.0000 Max. :1.0000
## SEATBELT CLMAGE LOSS
## Min. :0.00000 Min. : 0.00 Min. : 0.000
## 1st Qu.:0.00000 1st Qu.: 9.00 1st Qu.: 0.440
## Median :0.00000 Median :30.00 Median : 1.311
## Mean :0.01825 Mean :28.59 Mean : 3.857
## 3rd Qu.:0.00000 3rd Qu.:43.00 3rd Qu.: 3.910
## Max. :1.00000 Max. :95.00 Max. :173.604
# Column names of the raw data frame.
names(claimants)
## [1] "CASENUM" "ATTORNEY" "CLMSEX" "CLMINSUR" "SEATBELT" "CLMAGE"
## [7] "LOSS"
# Compact overview of the data frame: dimensions and the type of each column.
str(object = claimants)
## 'data.frame': 1340 obs. of 7 variables:
## $ CASENUM : int 5 3 66 70 96 97 10 36 51 55 ...
## $ ATTORNEY: int 0 1 1 0 1 0 0 0 1 1 ...
## $ CLMSEX : int 0 1 0 0 0 1 0 1 1 0 ...
## $ CLMINSUR: int 1 0 1 1 1 1 1 1 1 1 ...
## $ SEATBELT: int 0 0 0 1 0 0 0 0 0 0 ...
## $ CLMAGE : int 50 18 5 31 30 35 9 34 60 NA ...
## $ LOSS : num 34.94 0.891 0.33 0.037 0.038 ...
# Inspect individual columns through the data frame itself instead of the
# attach()ed search path, so a same-named global object cannot mask them.
str(claimants$ATTORNEY)
## int [1:1340] 0 1 1 0 1 0 0 0 1 1 ...
str(claimants$CLMSEX)
## int [1:1340] 0 1 0 0 0 1 0 1 1 0 ...
# CLMSEX is stored as 0/1 integers; converted to a factor it has two levels.
str(as.factor(claimants$CLMSEX))
## Factor w/ 2 levels "0","1": 1 2 1 1 1 2 1 2 2 1 ...
# Remove the temporary complete-case copy `a`; the models below are fitted on
# `claimants` directly.
rm(a)
glm - generalised linear model
# Binomial (logistic) GLM of ATTORNEY on CLMSEX, CLMINSUR and SEATBELT
# (each treated as a factor) plus the numeric CLMAGE and LOSS columns.
m1 <- glm(
  formula = ATTORNEY ~ factor(CLMSEX) + factor(CLMINSUR) + factor(SEATBELT) +
    CLMAGE + LOSS,
  family = binomial,
  data = claimants
)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Binomial (logistic) GLM of ATTORNEY on all remaining columns of `claimants`.
# CASENUM is excluded: it appears to be a case identifier rather than a
# claimant attribute (TODO confirm), and `ATTORNEY ~ .` would otherwise fit it
# as a numeric predictor.
m2 <- glm(ATTORNEY ~ . - CASENUM, family = binomial, data = claimants)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Estimated coefficients of m1 (on the log-odds scale of the binomial fit).
coef(m1)
## (Intercept) factor(CLMSEX)1 factor(CLMINSUR)1 factor(SEATBELT)1
## -0.199977705 0.432996012 0.602173170 -0.781078544
## CLMAGE LOSS
## 0.006487375 -0.385043853
# Full fit report for m1: coefficient table, deviances and AIC.
summary(m1)
##
## Call:
## glm(formula = ATTORNEY ~ factor(CLMSEX) + factor(CLMINSUR) +
## factor(SEATBELT) + CLMAGE + LOSS, family = binomial, data = claimants)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.74474 -1.01055 -0.02547 0.95764 2.78320
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.199978 0.246769 -0.810 0.41772
## factor(CLMSEX)1 0.432996 0.135706 3.191 0.00142 **
## factor(CLMINSUR)1 0.602173 0.231030 2.606 0.00915 **
## factor(SEATBELT)1 -0.781079 0.566125 -1.380 0.16768
## CLMAGE 0.006487 0.003324 1.952 0.05097 .
## LOSS -0.385044 0.034845 -11.050 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1516.1 on 1095 degrees of freedom
## Residual deviance: 1287.8 on 1090 degrees of freedom
## (244 observations deleted due to missingness)
## AIC: 1299.8
##
## Number of Fisher Scoring iterations: 6
exp(coef(m1)) # exponentiated log-odds coefficients, i.e. odds ratios
## (Intercept) factor(CLMSEX)1 factor(CLMINSUR)1 factor(SEATBELT)1
## 0.8187490 1.5418701 1.8260829 0.4579119
## CLMAGE LOSS
## 1.0065085 0.6804208
prediction
# Predicted probability from m1 for every row of `claimants`.
# type = "response" returns probabilities instead of the default link-scale
# (log-odds) values. Rows with a missing predictor get NA.
# The earlier link-scale predict() call was dropped: its result was
# overwritten immediately and never used.
prob <- predict(m1, newdata = claimants, type = "response")
head(prob)
## 1 2 3 4 5
## 2.970231e-06 5.016792e-01 5.762915e-01 4.521417e-01 6.415700e-01
## 6
## 7.197599e-01
# Put the predicted probabilities in front of the original columns so each row
# carries its own prediction.
pvprob <- as.data.frame(prob)
final <- cbind(pvprob, claimants)
# View() opens a data viewer and is only useful in an interactive session,
# so it is skipped when the script runs in batch mode.
if (interactive()) {
  View(final)
}
dim(final)
## [1] 1340 8
# Observed counts of each ATTORNEY value. The column is taken from the data
# frame explicitly; naming the argument keeps the "ATTORNEY" header.
table(ATTORNEY = claimants$ATTORNEY)
## ATTORNEY
## 0 1
## 685 655
# Predicted class counts at a 0.5 probability cut-off; table() drops the NA
# predictions, so the counts cover only rows that could be scored.
table(prob > 0.5)
##
## FALSE TRUE
## 505 591
# Confusion matrix: predicted class (rows) against observed ATTORNEY (columns).
confusion <- table(prob > 0.5, ATTORNEY = claimants$ATTORNEY)
# Storage mode of the response column.
mode(claimants$ATTORNEY)
## [1] "numeric"
# Frequency of each claimant age value (header kept as "CLMAGE").
table(CLMAGE = claimants$CLMAGE)
## CLMAGE
## 0 1 3 4 5 6 7 8 9 10 11 13 14 15 16 17 18 19 30 31 33 34 35 36 37
## 45 15 16 13 32 46 41 45 42 61 30 22 16 21 26 22 14 27 45 17 27 26 24 20 23
## 38 39 40 41 43 44 45 46 47 48 49 50 51 53 54 55 56 57 58 59 60 61 63 64 65
## 24 29 55 23 24 18 15 16 16 18 21 30 6 7 14 12 7 7 8 5 12 6 5 5 2
## 66 67 68 69 70 71 73 74 75 76 77 78 80 83 84 86 88 95
## 4 2 2 5 11 3 3 1 1 2 3 3 4 2 1 1 1 1