# Implementation
# Report the working directory; the relative path "claimants.csv" used below
# is resolved against it.
getwd()
## [1] "E:/Excelr DS/R _Codes/Logistic Regression"

# Read the claimants data exactly once. Previously the file was read from
# "claimants.csv" and the result was immediately overwritten by a second
# read.csv(file.choose()), so the first read was wasted and the script
# depended on an interactive file picker on every run.
# The picker is now only a fallback when the file is missing and the session
# is interactive; otherwise a missing file is reported as an error.
claimants_path <- "claimants.csv"
if (!file.exists(claimants_path)) {
  if (!interactive()) {
    stop("claimants.csv not found in ", getwd(), call. = FALSE)
  }
  claimants_path <- file.choose() # Choose the claimants data set
}
claimants <- read.csv(claimants_path)

# View() opens a data viewer, which is only useful in an interactive session.
if (interactive()) {
  View(claimants)
}

# NOTE(review): attach() is discouraged because the attached copy does not
# follow later changes to `claimants`. It is kept here because statements
# elsewhere in this script may refer to columns by bare name; remove it once
# every model call passes data = claimants or uses claimants$<column>.
attach(claimants)

# Per-column summary statistics, including NA counts.
summary(claimants)
## CASENUM ATTORNEY CLMSEX CLMINSUR
## Min. : 0 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.: 4177 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:1.0000
## Median : 8756 Median :0.0000 Median :1.0000 Median :1.0000
## Mean :11202 Mean :0.4888 Mean :0.5587 Mean :0.9076
## 3rd Qu.:15702 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :34153 Max. :1.0000 Max. :1.0000 Max. :1.0000
## NA's :12 NA's :41
## SEATBELT CLMAGE LOSS
## Min. :0.00000 Min. : 0.00 Min. : 0.000
## 1st Qu.:0.00000 1st Qu.: 9.00 1st Qu.: 0.400
## Median :0.00000 Median :30.00 Median : 1.069
## Mean :0.01703 Mean :28.41 Mean : 3.806
## 3rd Qu.:0.00000 3rd Qu.:43.00 3rd Qu.: 3.781
## Max. :1.00000 Max. :95.00 Max. :173.604
## NA's :48 NA's :189
# Linear Regression
# Fit an ordinary linear model of ATTORNEY on CLMSEX and CLMINSUR (both as
# factors), CLMAGE and LOSS.
# data = claimants is passed explicitly so the fit no longer depends on
# attach(): bare column names looked up through the search path would use a
# stale copy if `claimants` were modified after being attached.
fit <- lm(
  ATTORNEY ~ factor(CLMSEX) + factor(CLMINSUR) + CLMAGE + LOSS,
  data = claimants
)
summary(fit)
##
## Call:
## lm(formula = ATTORNEY ~ factor(CLMSEX) + factor(CLMINSUR) + CLMAGE +
## LOSS, data = claimants)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.5799 -0.4768 -0.1769 0.4693 1.3120
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.3367496 0.0521232 6.461 1.56e-10 ***
## factor(CLMSEX)1 0.0825409 0.0293292 2.814 0.00497 **
## factor(CLMINSUR)1 0.1326489 0.0491720 2.698 0.00709 **
## CLMAGE 0.0004053 0.0007154 0.567 0.57113
## LOSS -0.0108442 0.0013994 -7.749 2.08e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4839 on 1110 degrees of freedom
## (225 observations deleted due to missingness)
## Multiple R-squared: 0.06478, Adjusted R-squared: 0.06141
## F-statistic: 19.22 on 4 and 1110 DF, p-value: 2.652e-15
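# Linear regression is not appropriate here: ATTORNEY is a binary (0/1)
# outcome, yet the fit above has a maximum residual of 1.31 (so some fitted
# values fall below 0) and explains only about 6% of the variance
# (R-squared = 0.065).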
# Logistic Regression
# List the columns available in the data set.
colnames(claimants)
## [1] "CASENUM" "ATTORNEY" "CLMSEX" "CLMINSUR" "SEATBELT" "CLMAGE"
## [7] "LOSS"
# Inspect how CLMSEX looks once converted to a factor. The column is taken
# from the data frame explicitly instead of relying on attach(), so the check
# always reflects the current contents of `claimants`.
str(as.factor(claimants$CLMSEX))
## Factor w/ 2 levels "0","1": 1 2 1 1 1 2 1 2 2 1 ...
# Logistic regression (binomial family) of ATTORNEY on CLMSEX, CLMINSUR and
# SEATBELT, each treated as a factor, plus the numeric CLMAGE and LOSS.
logit <- glm(
  ATTORNEY ~ factor(CLMSEX) + factor(CLMINSUR) + factor(SEATBELT) +
    CLMAGE + LOSS,
  family = binomial,
  data = claimants
)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Coefficient table, deviances and AIC of the fitted model.
summary(logit)
##
## Call:
## glm(formula = ATTORNEY ~ factor(CLMSEX) + factor(CLMINSUR) +
## factor(SEATBELT) + CLMAGE + LOSS, family = binomial, data = claimants)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.74474 -1.01055 -0.02547 0.95764 2.78320
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.199978 0.246769 -0.810 0.41772
## factor(CLMSEX)1 0.432996 0.135706 3.191 0.00142 **
## factor(CLMINSUR)1 0.602173 0.231030 2.606 0.00915 **
## factor(SEATBELT)1 -0.781079 0.566125 -1.380 0.16768
## CLMAGE 0.006487 0.003324 1.952 0.05097 .
## LOSS -0.385044 0.034845 -11.050 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1516.1 on 1095 degrees of freedom
## Residual deviance: 1287.8 on 1090 degrees of freedom
## (244 observations deleted due to missingness)
## AIC: 1299.8
##
## Number of Fisher Scoring iterations: 6
library(MASS)
## Warning: package 'MASS' was built under R version 3.4.4
library(caret)
## Warning: package 'caret' was built under R version 3.4.4
## Loading required package: lattice
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.4.1
library(car)
## Warning: package 'car' was built under R version 3.4.4
## Loading required package: carData
## Warning: package 'carData' was built under R version 3.4.4
# AIC-based stepwise selection starting from the full logistic model; the
# selected model is kept in `x`.
x <- MASS::stepAIC(logit)
## Start: AIC=1299.85
## ATTORNEY ~ factor(CLMSEX) + factor(CLMINSUR) + factor(SEATBELT) +
## CLMAGE + LOSS
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Df Deviance AIC
## <none> 1287.8 1299.8
## - factor(SEATBELT) 1 1289.9 1299.9
## - CLMAGE 1 1291.7 1301.7
## - factor(CLMINSUR) 1 1294.8 1304.8
## - factor(CLMSEX) 1 1298.1 1308.1
## - LOSS 1 1494.3 1504.3
# Variance inflation factors for the terms of the logistic model.
car::vif(logit)
## factor(CLMSEX) factor(CLMINSUR) factor(SEATBELT) CLMAGE
## 1.010314 1.006588 1.002524 1.027312
## LOSS
## 1.028889
# Odds Ratio
# Exponentiate the fitted coefficients (log-odds scale) to read them as odds
# ratios.
exp(stats::coef(logit))
## (Intercept) factor(CLMSEX)1 factor(CLMINSUR)1 factor(SEATBELT)1
## 0.8187490 1.5418701 1.8260829 0.4579119
## CLMAGE LOSS
## 1.0065085 0.6804208
# Confusion matrix table
# Predicted probabilities (response scale) for the rows of `claimants`.
prob <- predict(logit, newdata = claimants, type = "response")
# Keep the probabilities as a one-column data frame named `prob`.
prob <- as.data.frame(prob)
# Predicted probability side by side with the original columns.
final <- cbind(prob, claimants)
# Cross-tabulate the predicted class (probability above the 0.5 cutoff)
# against the observed ATTORNEY value.
confusion <- table(prob > 0.5, claimants$ATTORNEY)
# How many predictions fall on each side of the cutoff.
table(prob > 0.5)
##
## FALSE TRUE
## 505 591
# Print the confusion table itself.
confusion
##
## 0 1
## FALSE 380 125
## TRUE 198 393
# Model Accuracy
# Accuracy = correctly classified cases (diagonal of the confusion table)
# divided by all classified cases.
# The closing parenthesis previously sat after sum(confusion), which divided
# each diagonal cell by the total before summing. That is mathematically
# equivalent, but it hid the intent and added an avoidable rounding step; the
# diagonal is now summed first.
Accuracy <- sum(diag(confusion)) / sum(confusion)
Accuracy
## [1] 0.705292
# ROC Curve
library(ROCR)
## Warning: package 'ROCR' was built under R version 3.4.2
## Loading required package: gplots
## Warning: package 'gplots' was built under R version 3.4.1
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
# Pair the predicted probabilities with the observed ATTORNEY labels.
rocrpred <- prediction(prob, claimants$ATTORNEY)
# True-positive rate against false-positive rate over the range of cutoffs.
rocrperf <- performance(rocrpred, "tpr", "fpr")
# TRUE rather than T: T is an ordinary variable that can be reassigned.
plot(rocrperf, colorize = TRUE, text.adj = c(-0.2, 1.7))
# Area under the ROC curve (AUC): a single-number summary of the curve
# plotted above, which the script previously never computed.
rocrauc <- performance(rocrpred, measure = "auc")
rocrauc@y.values[[1]]

# The larger the area under the ROC curve (AUC), the better the logistic
# regression model separates the two ATTORNEY classes.