# Load data ----
# Read the claimants data set from a CSV file picked through the interactive
# file chooser.
claimants <- read.csv(file = file.choose())

# Number of rows and columns.
dim(claimants)
## [1] 1340 7

# Per-column summary statistics, including NA counts.
summary(claimants)
## CASENUM ATTORNEY CLMSEX CLMINSUR
## Min. : 0 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.: 4177 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:1.0000
## Median : 8756 Median :0.0000 Median :1.0000 Median :1.0000
## Mean :11202 Mean :0.4888 Mean :0.5587 Mean :0.9076
## 3rd Qu.:15702 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :34153 Max. :1.0000 Max. :1.0000 Max. :1.0000
## NA's :12 NA's :41
## SEATBELT CLMAGE LOSS
## Min. :0.00000 Min. : 0.00 Min. : 0.000
## 1st Qu.:0.00000 1st Qu.: 9.00 1st Qu.: 0.400
## Median :0.00000 Median :30.00 Median : 1.069
## Mean :0.01703 Mean :28.41 Mean : 3.806
## 3rd Qu.:0.00000 3rd Qu.:43.00 3rd Qu.: 3.781
## Max. :1.00000 Max. :95.00 Max. :173.604
## NA's :48 NA's :189

# Column types and the first few values of each column.
str(claimants)
## 'data.frame': 1340 obs. of 7 variables:
## $ CASENUM : int 5 3 66 70 96 97 10 36 51 55 ...
## $ ATTORNEY: int 0 1 1 0 1 0 0 0 1 1 ...
## $ CLMSEX : int 0 1 0 0 0 1 0 1 1 0 ...
## $ CLMINSUR: int 1 0 1 1 1 1 1 1 1 1 ...
## $ SEATBELT: int 0 0 0 1 0 0 0 0 0 0 ...
## $ CLMAGE : int 50 18 5 31 30 35 9 34 60 NA ...
## $ LOSS : num 34.94 0.891 0.33 0.037 0.038 ...

# Column names.
names(claimants)
## [1] "CASENUM" "ATTORNEY" "CLMSEX" "CLMINSUR" "SEATBELT" "CLMAGE" "LOSS"

# Put the columns on the search path so they can be referenced by bare name.
attach(claimants)
# Logistic regression models ----
# ATTORNEY is a 0/1 outcome, so both models are fitted with family = binomial.
# When no family is given, glm() defaults to gaussian and fits an ordinary
# linear model rather than a logistic regression.

# m1: model with every available predictor, including SEATBELT.
m1 <- glm(
  ATTORNEY ~ factor(CLMSEX) + factor(CLMINSUR) + factor(SEATBELT) +
    CLMAGE + LOSS,
  family = binomial,
  data = claimants
)
summary(m1)
## (Output not recorded here; run summary(m1) to view the binomial fit.)

# m2: same model with the SEATBELT term left out.
m2 <- glm(
  ATTORNEY ~ factor(CLMSEX) + factor(CLMINSUR) + CLMAGE + LOSS,
  family = binomial,
  data = claimants
)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(m2)
##
## Call:
## glm(formula = ATTORNEY ~ factor(CLMSEX) + factor(CLMINSUR) +
## CLMAGE + LOSS, family = binomial, data = claimants)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.75165 -1.01231 -0.00516 0.96103 2.80542
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.181903 0.242097 -0.751 0.45243
## factor(CLMSEX)1 0.418815 0.134478 3.114 0.00184 **
## factor(CLMINSUR)1 0.564177 0.226708 2.489 0.01283 *
## CLMAGE 0.007197 0.003302 2.179 0.02930 *
## LOSS -0.390260 0.034886 -11.187 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1542.6 on 1114 degrees of freedom
## Residual deviance: 1312.8 on 1110 degrees of freedom
## (225 observations deleted due to missingness)
## AIC: 1322.8
##
## Number of Fisher Scoring iterations: 6

# Coefficients of m1 (on the log-odds scale for a binomial/logit fit).
coef(m1)
## (Output not recorded here; run coef(m1) to view the binomial estimates.)
# Prediction ----
# type = "response" returns fitted probabilities. The default (type = "link")
# returns log-odds, for which 0.5 is not the 50% probability cut-off.
# Rows with missing predictors get NA and are dropped by table() below.
pv <- predict(m2, newdata = claimants, type = "response")

# Observed class counts.
table(ATTORNEY = claimants$ATTORNEY)
## ATTORNEY
## 0 1
## 685 655

# Predicted class at the 0.5 probability cut-off. Fixing both factor levels
# keeps the tables 2 x 2 even if every prediction falls on the same side.
pred_class <- factor(pv > 0.5, levels = c(FALSE, TRUE))
table(pred_class)

# Confusion matrix ----
# Rows are the observed class (0, 1); columns are the predicted class
# (FALSE, TRUE). Hence [2, 2] = TP, [1, 1] = TN, [1, 2] = FP, [2, 1] = FN.
confusion <- table(
  ATTORNEY = factor(claimants$ATTORNEY, levels = c(0, 1)),
  predicted = pred_class
)
confusion

# Accuracy => (TP + TN) / total
ACCURACY <- sum(diag(confusion)) / sum(confusion)
ACCURACY

# Precision => TP / (TP + FP): share of predicted positives (column 2) that
# are truly positive.
PRECISION <- confusion[2, 2] / sum(confusion[, 2])
PRECISION

# Recall => TP / (TP + FN): share of actual positives (row 2) that were
# predicted positive.
RECALL <- confusion[2, 2] / sum(confusion[2, ])
RECALL
## (Outputs for pred_class, confusion and the three metrics are not recorded
## here; re-run the script to regenerate them.)