library('pscl')
## Warning: package 'pscl' was built under R version 3.4.3
## Classes and Methods for R developed in the
## Political Science Computational Laboratory
## Department of Political Science
## Stanford University
## Simon Jackman
## hurdle and zeroinfl functions by Achim Zeileis
library('lmtest')
## Warning: package 'lmtest' was built under R version 3.4.2
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 3.4.1
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library('ROCR')
## Warning: package 'ROCR' was built under R version 3.4.1
## Loading required package: gplots
## Warning: package 'gplots' was built under R version 3.4.1
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
library('caret')
## Warning: package 'caret' was built under R version 3.4.3
## Loading required package: lattice
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.4.1
# Load the sales data (columns Income, Lot.Size and Ownership, per the summary
# shown after this step). stringsAsFactors = TRUE is spelled out because
# read.csv() stopped converting strings to factors by default in R 4.0.0;
# without it Ownership would load as character and the factor counts in the
# summary would not be reproduced.
salesdata <- read.csv("salesdata.csv", stringsAsFactors = TRUE)
Run the logistic regression analysis. The McFadden pseudo R-squared value is about 0.54 (54%), which is quite good for model evaluation.
# Quick look at the distribution of every column before modelling.
summary(salesdata)
## Income Lot.Size Ownership
## Min. : 33.00 Min. :14.00 non-owner:12
## 1st Qu.: 52.35 1st Qu.:17.50 owner :12
## Median : 64.80 Median :19.00
## Mean : 68.44 Mean :18.95
## 3rd Qu.: 83.10 3rd Qu.:20.80
## Max. :110.10 Max. :23.60

# Logistic regression of Ownership on Income and Lot.Size.
mysalesglm <- glm(
  Ownership ~ Income + Lot.Size,
  data = salesdata,
  family = binomial
)

# Likelihood-ratio test of the fitted model against the intercept-only model.
lrtest(mysalesglm)
## Likelihood ratio test
##
## Model 1: Ownership ~ Income + Lot.Size
## Model 2: Ownership ~ 1
## #Df LogLik Df Chisq Pr(>Chisq)
## 1 3 -7.6616
## 2 1 -16.6355 -2 17.948 0.0001267 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# Pseudo R-squared measures for the fit (McFadden, r2ML, r2CU).
pR2(mysalesglm)
## llh llhNull G2 McFadden r2ML r2CU
## -7.6616403 -16.6355323 17.9477841 0.5394412 0.5266046 0.7021395
Let's test the regression model's performance with a confusion matrix. The accuracy is 83.33%.
# Score the data used for fitting; type = "response" returns predicted
# probabilities rather than values on the link (log-odds) scale.
salesdata$Ownershippredicted <- predict(mysalesglm, newdata = salesdata, type = "response")

# Classify at a 0.5 probability cutoff.
salesdata$OwnershippredictedRev <- ifelse(salesdata$Ownershippredicted > 0.5, "owner", "non-owner")

# confusionMatrix() treats the ROWS of a table as predictions and the COLUMNS
# as the true classes, so the predicted class must be the first dimension.
# Pinning the factor levels keeps the table 2 x 2 even when a cutoff leaves
# one class with no predictions.
ownership_levels <- c("non-owner", "owner")
salesdataRev <- table(
  predictedclass = factor(salesdata$OwnershippredictedRev, levels = ownership_levels),
  actualclass = factor(salesdata$Ownership, levels = ownership_levels)
)
confusionMatrix(salesdataRev) # generating the confusion matrix
## Confusion Matrix and Statistics
##
## actualclass
## predictedclass non-owner owner
## non-owner 10 2
## owner 2 10
##
## Accuracy : 0.8333
## 95% CI : (0.6262, 0.9526)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : 0.0007719
##
## Kappa : 0.6667
## Mcnemar's Test P-Value : 1.0000000
##
## Sensitivity : 0.8333
## Specificity : 0.8333
## Pos Pred Value : 0.8333
## Neg Pred Value : 0.8333
## Prevalence : 0.5000
## Detection Rate : 0.4167
## Detection Prevalence : 0.5000
## Balanced Accuracy : 0.8333
##
## 'Positive' Class : non-owner
##
Now let's evaluate the model performance after increasing the cutoff probability to 0.6. Accuracy increases to 87.5%. Key observations: correct non-owner predictions increase (10 to 11), correct owner predictions remain the same (10), and non-owners misclassified as owners decrease (2 to 1).
# Re-classify using a stricter 0.6 probability cutoff for "owner".
salesdata$OwnershippredictedRev1 <- ifelse(salesdata$Ownershippredicted > 0.6, "owner", "non-owner")

# confusionMatrix() treats the ROWS of a table as predictions and the COLUMNS
# as the true classes, so the predicted class must be the first dimension;
# with the dimensions swapped, sensitivity/specificity, the predictive values
# and prevalence are computed against the wrong margin. Pinning the factor
# levels keeps the table 2 x 2 even when a cutoff leaves one class with no
# predictions.
ownership_levels <- c("non-owner", "owner")
salesdataConf <- table(
  predictedclass = factor(salesdata$OwnershippredictedRev1, levels = ownership_levels),
  actualclass = factor(salesdata$Ownership, levels = ownership_levels)
)
confusionMatrix(salesdataConf) # generating the confusion matrix
## Confusion Matrix and Statistics
##
## actualclass
## predictedclass non-owner owner
## non-owner 11 2
## owner 1 10
##
## Accuracy : 0.875
## 95% CI : (0.6764, 0.9734)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : 0.0001386
##
## Kappa : 0.75
## Mcnemar's Test P-Value : 1.0000000
##
## Sensitivity : 0.9167
## Specificity : 0.8333
## Pos Pred Value : 0.8462
## Neg Pred Value : 0.9091
## Prevalence : 0.5000
## Detection Rate : 0.4583
## Detection Prevalence : 0.5417
## Balanced Accuracy : 0.8750
##
## 'Positive' Class : non-owner
##
Now let's evaluate the model performance with the area under the ROC curve. The ROC is a curve generated by plotting the true positive rate (TPR) against the false positive rate (FPR) at various threshold settings, while the AUC is the area under the ROC curve. As a rule of thumb, a model with good predictive ability should have an AUC closer to 1 (1 is ideal) than to 0.5.
# Pair the predicted probabilities with the observed classes for ROCR.
pr <- prediction(
  predictions = salesdata$Ownershippredicted,
  labels = salesdata$Ownership
)

# ROC curve: true positive rate (y axis) against false positive rate (x axis).
prf <- performance(pr, measure = "tpr", x.measure = "fpr")
plot(prf)

# Area under the ROC curve, pulled out of the performance object's y.values
# slot as a plain number.
auc <- performance(pr, measure = "auc")@y.values[[1]]
auc
## [1] 0.9236111