# Load data ----
# The CSV is picked interactively through a file dialog.
claimants <- read.csv(file.choose())

# Inspect data ----
# Dimensions: rows x columns.
dim(claimants)
## [1] 1340    7

# Per-column summaries; note the NA counts reported for CLMSEX, CLMINSUR,
# SEATBELT and CLMAGE.
summary(claimants)
##     CASENUM         ATTORNEY          CLMSEX          CLMINSUR     
##  Min.   :    0   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.: 4177   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:1.0000  
##  Median : 8756   Median :0.0000   Median :1.0000   Median :1.0000  
##  Mean   :11202   Mean   :0.4888   Mean   :0.5587   Mean   :0.9076  
##  3rd Qu.:15702   3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:1.0000  
##  Max.   :34153   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##                                   NA's   :12       NA's   :41      
##     SEATBELT           CLMAGE           LOSS        
##  Min.   :0.00000   Min.   : 0.00   Min.   :  0.000  
##  1st Qu.:0.00000   1st Qu.: 9.00   1st Qu.:  0.400  
##  Median :0.00000   Median :30.00   Median :  1.069  
##  Mean   :0.01703   Mean   :28.41   Mean   :  3.806  
##  3rd Qu.:0.00000   3rd Qu.:43.00   3rd Qu.:  3.781  
##  Max.   :1.00000   Max.   :95.00   Max.   :173.604  
##  NA's   :48        NA's   :189

# Column types as read by read.csv().
str(claimants)
## 'data.frame':    1340 obs. of  7 variables:
##  $ CASENUM : int  5 3 66 70 96 97 10 36 51 55 ...
##  $ ATTORNEY: int  0 1 1 0 1 0 0 0 1 1 ...
##  $ CLMSEX  : int  0 1 0 0 0 1 0 1 1 0 ...
##  $ CLMINSUR: int  1 0 1 1 1 1 1 1 1 1 ...
##  $ SEATBELT: int  0 0 0 1 0 0 0 0 0 0 ...
##  $ CLMAGE  : int  50 18 5 31 30 35 9 34 60 NA ...
##  $ LOSS    : num  34.94 0.891 0.33 0.037 0.038 ...

# Column names.
names(claimants)
## [1] "CASENUM"  "ATTORNEY" "CLMSEX"   "CLMINSUR" "SEATBELT" "CLMAGE"   "LOSS"

# attach() puts the columns on the search path so they can be referenced by
# bare name (e.g. ATTORNEY). In new code prefer claimants$COLUMN or a `data =`
# argument, which do not depend on the search path.
attach(claimants)
# Logistic regression models ----
# ATTORNEY is a 0/1 outcome, so both fits specify family = binomial. When
# `family` is omitted glm() uses the gaussian family, i.e. an ordinary
# least-squares fit rather than a logistic regression.

# m1: all candidate predictors.
m1 <- glm(
  ATTORNEY ~ factor(CLMSEX) + factor(CLMINSUR) + factor(SEATBELT) +
    CLMAGE + LOSS,
  family = binomial,
  data = claimants
)
summary(m1)
# No transcript is pasted for m1: earlier transcripts of this call were
# produced by a gaussian fit and do not describe the binomial model.
# Re-run the script to regenerate the summary.

# m2: the same predictors as m1 except factor(SEATBELT).
m2 <- glm(
  ATTORNEY ~ factor(CLMSEX) + factor(CLMINSUR) + CLMAGE + LOSS,
  family = binomial,
  data = claimants
)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(m2)
## 
## Call:
## glm(formula = ATTORNEY ~ factor(CLMSEX) + factor(CLMINSUR) + 
##     CLMAGE + LOSS, family = binomial, data = claimants)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.75165  -1.01231  -0.00516   0.96103   2.80542  
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)       -0.181903   0.242097  -0.751  0.45243    
## factor(CLMSEX)1    0.418815   0.134478   3.114  0.00184 ** 
## factor(CLMINSUR)1  0.564177   0.226708   2.489  0.01283 *  
## CLMAGE             0.007197   0.003302   2.179  0.02930 *  
## LOSS              -0.390260   0.034886 -11.187  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1542.6  on 1114  degrees of freedom
## Residual deviance: 1312.8  on 1110  degrees of freedom
##   (225 observations deleted due to missingness)
## AIC: 1322.8
## 
## Number of Fisher Scoring iterations: 6

# Coefficients ----
# For a binomial fit these are on the log-odds scale.
coef(m1)
# Transcript omitted for the same reason as summary(m1); re-run to regenerate.
# Prediction ----
# type = "response" returns fitted probabilities. predict() on a glm defaults
# to type = "link" (log-odds for a binomial model), and a 0.5 cut-off applied
# to log-odds does not correspond to a probability of 0.5.
# Rows of `claimants` with a missing predictor get an NA prediction.
pv <- predict(m2, newdata = claimants, type = "response")

# Predicted class is TRUE when the fitted probability exceeds 0.5. Fixing the
# factor levels keeps both the FALSE and TRUE columns in the tables even if
# every prediction falls on one side of the cut-off.
pred_class <- factor(pv > 0.5, levels = c(FALSE, TRUE))

# Tables ----
# Columns are referenced through `claimants$` so this section does not depend
# on the data frame being attached.
table(ATTORNEY = claimants$ATTORNEY)
table(predicted = pred_class)

# Confusion matrix ----
# Rows are the observed ATTORNEY value (0/1), columns the predicted class.
# table() drops rows where the prediction is NA.
confusion <- table(
  ATTORNEY = factor(claimants$ATTORNEY, levels = c(0, 1)),
  predicted = pred_class
)
confusion

# Metrics ----
# Cells are indexed by name so the formulas read as TP/FP/FN:
#   TP = confusion["1", "TRUE"],  FP = confusion["0", "TRUE"],
#   FN = confusion["1", "FALSE"], TN = confusion["0", "FALSE"].

# Accuracy => (TP + TN) / total
ACCURACY <- sum(diag(confusion)) / sum(confusion)
ACCURACY

# Precision => TP / (TP + FP): share of predicted positives that are correct.
# NaN when nothing is predicted positive.
PRECISION <- confusion["1", "TRUE"] / sum(confusion[, "TRUE"])
PRECISION

# Recall => TP / (TP + FN): share of observed positives that are detected.
RECALL <- confusion["1", "TRUE"] / sum(confusion["1", ])
RECALL

# No transcripts are pasted for this section; re-run the script to print the
# tables and metrics for the probability-scale predictions.