library('pscl')
## Warning: package 'pscl' was built under R version 3.4.3
## Classes and Methods for R developed in the
## Political Science Computational Laboratory
## Department of Political Science
## Stanford University
## Simon Jackman
## hurdle and zeroinfl functions by Achim Zeileis
library('lmtest')
## Warning: package 'lmtest' was built under R version 3.4.2
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 3.4.1
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library('ROCR')
## Warning: package 'ROCR' was built under R version 3.4.1
## Loading required package: gplots
## Warning: package 'gplots' was built under R version 3.4.1
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
library('caret')
## Warning: package 'caret' was built under R version 3.4.3
## Loading required package: lattice
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.4.1

R Markdown

Steps involved

1. Import the data

2. Build the logit model and generate predictions (on the same data used for fitting; no separate test set is held out)

3. Do model diagnostics

Import the data

# Load the sales data. Ownership has to be a factor for the binomial glm and
# for the per-level counts printed by summary(); since R 4.0.0 read.csv() no
# longer converts strings to factors by default, so request it explicitly.
salesdata <- read.csv("salesdata.csv", stringsAsFactors = TRUE)

Run the logistic regression analysis. The McFadden pseudo-R² is 0.54 (54%), which is quite good for model evaluation.

# Column-by-column overview of the imported data.
summary(object = salesdata)
##      Income          Lot.Size         Ownership 
##  Min.   : 33.00   Min.   :14.00   non-owner:12  
##  1st Qu.: 52.35   1st Qu.:17.50   owner    :12  
##  Median : 64.80   Median :19.00                 
##  Mean   : 68.44   Mean   :18.95                 
##  3rd Qu.: 83.10   3rd Qu.:20.80                 
##  Max.   :110.10   Max.   :23.60
# Logistic regression of Ownership on Income and Lot.Size.
mysalesglm <- glm(
  formula = Ownership ~ Income + Lot.Size,
  family = binomial,
  data = salesdata
)

# Likelihood ratio test of the fitted model against the intercept-only model.
lrtest(mysalesglm)
## Likelihood ratio test
## 
## Model 1: Ownership ~ Income + Lot.Size
## Model 2: Ownership ~ 1
##   #Df   LogLik Df  Chisq Pr(>Chisq)    
## 1   3  -7.6616                         
## 2   1 -16.6355 -2 17.948  0.0001267 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Pseudo-R-squared measures for the fitted logit model (includes McFadden).
pR2(object = mysalesglm)
##         llh     llhNull          G2    McFadden        r2ML        r2CU 
##  -7.6616403 -16.6355323  17.9477841   0.5394412   0.5266046   0.7021395

Let's test the regression model's performance with a confusion matrix. Classification accuracy at a 0.5 probability cutoff is 83.33% (measured on the same data the model was fitted on).

# Fitted probability of the second factor level ("owner") for every row.
salesdata$Ownershippredicted <- predict(
  mysalesglm,
  newdata = salesdata,
  type = "response"
)

# Classify at a 0.5 probability cutoff.
salesdata$OwnershippredictedRev <- ifelse(
  salesdata$Ownershippredicted > 0.5, "owner", "non-owner"
)

# confusionMatrix() reads a table as predictions in ROWS and reference
# (actual) classes in COLUMNS, so the predicted labels must be the first
# table dimension. Both dimensions are pinned to the same two levels so the
# table stays 2 x 2 even if one class is never predicted.
class_levels <- c("non-owner", "owner")
salesdataRev <- table(
  predictedclass = factor(salesdata$OwnershippredictedRev, levels = class_levels),
  actualclass = factor(salesdata$Ownership, levels = class_levels)
)
confusionMatrix(salesdataRev) # generating the confusion matrix
## Confusion Matrix and Statistics
## 
##               actualclass
## predictedclass non-owner owner
##   non-owner           10     2
##   owner                2    10
##                                           
##                Accuracy : 0.8333          
##                  95% CI : (0.6262, 0.9526)
##     No Information Rate : 0.5             
##     P-Value [Acc > NIR] : 0.0007719       
##                                           
##                   Kappa : 0.6667          
##  Mcnemar's Test P-Value : 1.0000000       
##                                           
##             Sensitivity : 0.8333          
##             Specificity : 0.8333          
##          Pos Pred Value : 0.8333          
##          Neg Pred Value : 0.8333          
##              Prevalence : 0.5000          
##          Detection Rate : 0.4167          
##    Detection Prevalence : 0.5000          
##       Balanced Accuracy : 0.8333          
##                                           
##        'Positive' Class : non-owner       
## 

Now let's evaluate the model's performance after increasing the probability cutoff to 0.6. Accuracy increases to 87.5%. Key observations: correctly classified non-owners increase from 10 to 11; correctly classified owners remain at 10; non-owners misclassified as owners drop from 2 to 1, while owners misclassified as non-owners remain at 2.

# Re-classify with a stricter 0.6 probability cutoff for "owner".
salesdata$OwnershippredictedRev1 <- ifelse(
  salesdata$Ownershippredicted > 0.6, "owner", "non-owner"
)

# confusionMatrix() reads a table as predictions in ROWS and reference
# (actual) classes in COLUMNS. With the dimensions the other way round the
# per-class statistics are mislabeled (e.g. prevalence is computed from the
# predicted counts instead of the actual 12/12 split), so the predicted
# labels go first. Both dimensions share the same two levels so the table
# stays 2 x 2 even if one class is never predicted.
class_levels <- c("non-owner", "owner")
salesdataConf <- table(
  predictedclass = factor(salesdata$OwnershippredictedRev1, levels = class_levels),
  actualclass = factor(salesdata$Ownership, levels = class_levels)
)
confusionMatrix(salesdataConf) # generating the confusion matrix
## Confusion Matrix and Statistics
## 
##               actualclass
## predictedclass non-owner owner
##   non-owner           11     2
##   owner                1    10
##                                           
##                Accuracy : 0.875           
##                  95% CI : (0.6764, 0.9734)
##     No Information Rate : 0.5             
##     P-Value [Acc > NIR] : 0.0001386       
##                                           
##                   Kappa : 0.75            
##  Mcnemar's Test P-Value : 1.0000000       
##                                           
##             Sensitivity : 0.9167          
##             Specificity : 0.8333          
##          Pos Pred Value : 0.8462          
##          Neg Pred Value : 0.9091          
##              Prevalence : 0.5000          
##          Detection Rate : 0.4583          
##    Detection Prevalence : 0.5417          
##       Balanced Accuracy : 0.8750          
##                                           
##        'Positive' Class : non-owner       
## 

Now let's evaluate the model's performance using the area under the ROC curve (AUC). The ROC curve is generated by plotting the true positive rate (TPR) against the false positive rate (FPR) at various threshold settings, and the AUC is the area under that curve. As a rule of thumb, a model with good predictive ability should have an AUC closer to 1 (the ideal) than to 0.5.

# Pair each fitted probability with its observed class for ROCR.
pr <- prediction(
  predictions = salesdata$Ownershippredicted,
  labels = salesdata$Ownership
)

# ROC curve: true positive rate (y axis) against false positive rate (x axis).
prf <- performance(pr, measure = "tpr", x.measure = "fpr")
plot(prf)

# Area under the ROC curve, taken from the performance object's y.values slot.
auc <- performance(pr, measure = "auc")@y.values[[1]]
auc
## [1] 0.9236111