library('pscl')
## Warning: package 'pscl' was built under R version 3.4.3
## Classes and Methods for R developed in the
## Political Science Computational Laboratory
## Department of Political Science
## Stanford University
## Simon Jackman
## hurdle and zeroinfl functions by Achim Zeileis
library('lmtest')
## Warning: package 'lmtest' was built under R version 3.4.2
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 3.4.3
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library('ROCR')
## Warning: package 'ROCR' was built under R version 3.4.1
## Loading required package: gplots
## Warning: package 'gplots' was built under R version 3.4.1
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
library('caret')
## Warning: package 'caret' was built under R version 3.4.3
## Loading required package: lattice
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.4.1
library('lmtest')
library('corrr')
## Warning: package 'corrr' was built under R version 3.4.1
## Loading required package: dplyr
## Warning: package 'dplyr' was built under R version 3.4.2
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library('corrplot')
## Warning: package 'corrplot' was built under R version 3.4.2
## corrplot 0.84 loaded

R Markdown

Steps involved

1. Import the data

2. Build Logit models and predict on test data

3. Do model diagnostics

Import the data

This analysis shows the regression analysis and AIC value monitoring for optimal modelling..

salesdata <- read.csv("nm-logit1.csv")

Run the logistic regression analysis…McFadden pseudo r-squared R2 value is 53% which is quite good for model evalutaion.

Here then i have compared two model by lrtest. Difference between two model loglikelyhood ratio is 3.0649 and multiplied this by 2 we get the Chi-Square value 6.1297.. The p-value is 0.01329 and this is significant..This means two model are not same..This also says that Logistic regression method is valid for this problem. Here we take the model mysalesglm1 as this has better Macfaden R2…

summary(salesdata)
##        no           Loyalty        Brand        Product     Shopping    
##  Min.   : 1.00   Min.   :0.0   Min.   :1.0   Min.   :1   Min.   :2.000  
##  1st Qu.: 8.25   1st Qu.:0.0   1st Qu.:3.0   1st Qu.:3   1st Qu.:2.250  
##  Median :15.50   Median :0.5   Median :5.0   Median :4   Median :4.000  
##  Mean   :15.50   Mean   :0.5   Mean   :4.6   Mean   :4   Mean   :3.533  
##  3rd Qu.:22.75   3rd Qu.:1.0   3rd Qu.:6.0   3rd Qu.:5   3rd Qu.:4.000  
##  Max.   :30.00   Max.   :1.0   Max.   :7.0   Max.   :7   Max.   :6.000
smp_size <- floor(0.75 * nrow(salesdata))
set.seed(123)
train_ind  <- sample(seq_len(nrow(salesdata)), size = smp_size)
train_data <- salesdata[train_ind, ]
test_data <- salesdata[-train_ind, ]

mysalesglm1 <- glm(test_data$Loyalty ~ test_data$Brand + test_data$Product , data=train_data, family = binomial)  ## Here the AIC value is 12.13 and residual deviance is 6.13
summary(mysalesglm1)
## 
## Call:
## glm(formula = test_data$Loyalty ~ test_data$Brand + test_data$Product, 
##     family = binomial, data = train_data)
## 
## Deviance Residuals: 
##       1        2        3        4        5        6        7        8  
##  0.3577   0.1776   1.8055   0.4860  -0.9927  -1.0498  -0.2468  -0.5708  
## 
## Coefficients:
##                   Estimate Std. Error z value Pr(>|z|)
## (Intercept)        -5.3639     4.5871  -1.169    0.242
## test_data$Brand     1.7443     1.2191   1.431    0.152
## test_data$Product  -0.3202     0.5462  -0.586    0.558
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 11.0904  on 7  degrees of freedom
## Residual deviance:  6.1297  on 5  degrees of freedom
## AIC: 12.13
## 
## Number of Fisher Scoring iterations: 5
mysalesglm <- glm(test_data$Loyalty ~ test_data$Brand + test_data$Product + test_data$Shopping , data=train_data, family = binomial) # here the AIC value is 8 and Residual Deviance is 2.385e-10
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(mysalesglm)
## 
## Call:
## glm(formula = test_data$Loyalty ~ test_data$Brand + test_data$Product + 
##     test_data$Shopping, family = binomial, data = train_data)
## 
## Deviance Residuals: 
##          1           2           3           4           5           6  
##  2.110e-08   2.110e-08   5.885e-06   9.154e-06  -5.857e-06  -9.259e-06  
##          7           8  
## -2.110e-08  -2.110e-08  
## 
## Coefficients:
##                      Estimate Std. Error z value Pr(>|z|)
## (Intercept)          -235.957 597827.526       0        1
## test_data$Brand        42.576 102499.579       0        1
## test_data$Product      -2.596  36581.894       0        1
## test_data$Shopping     28.679  78310.492       0        1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1.1090e+01  on 7  degrees of freedom
## Residual deviance: 2.3845e-10  on 4  degrees of freedom
## AIC: 8
## 
## Number of Fisher Scoring iterations: 25
lrtest(mysalesglm,mysalesglm1)  ##lrtest is a generic function for carrying out likelihood ratio tests
## Likelihood ratio test
## 
## Model 1: test_data$Loyalty ~ test_data$Brand + test_data$Product + test_data$Shopping
## Model 2: test_data$Loyalty ~ test_data$Brand + test_data$Product
##   #Df  LogLik Df  Chisq Pr(>Chisq)  
## 1   4  0.0000                       
## 2   3 -3.0649 -1 6.1297    0.01329 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
lrtest(mysalesglm)  ##lrtest is a generic function for carrying out likelihood ratio tests
## Likelihood ratio test
## 
## Model 1: test_data$Loyalty ~ test_data$Brand + test_data$Product + test_data$Shopping
## Model 2: test_data$Loyalty ~ 1
##   #Df  LogLik Df Chisq Pr(>Chisq)  
## 1   4  0.0000                      
## 2   1 -5.5452 -3 11.09    0.01125 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
corrplot(cor(salesdata), method="circle", type="full", addCoef.col = "blue", order ="AOE", bg ='grey')

pR2(mysalesglm1)
##        llh    llhNull         G2   McFadden       r2ML       r2CU 
## -3.0648628 -5.5451774  4.9606293  0.4472922  0.4620979  0.6161305

Lets test the regression model performance and confusion matrix..Performance accuracy is 0.875%

test_data$LoyaltyPredicted <- predict(mysalesglm1,newdata=test_data,type = 'response')

test_data$LoyaltyPredictedRev  <- ifelse(test_data$LoyaltyPredicted > 0.5,1,0)
salesdataRev <- table(actualclass=test_data$Loyalty, predictedclass=test_data$LoyaltyPredictedRev)

confusionMatrix(salesdataRev) # generating the confusion matrix
## Confusion Matrix and Statistics
## 
##            predictedclass
## actualclass 0 1
##           0 4 0
##           1 1 3
##                                           
##                Accuracy : 0.875           
##                  95% CI : (0.4735, 0.9968)
##     No Information Rate : 0.625           
##     P-Value [Acc > NIR] : 0.135           
##                                           
##                   Kappa : 0.75            
##  Mcnemar's Test P-Value : 1.000           
##                                           
##             Sensitivity : 0.800           
##             Specificity : 1.000           
##          Pos Pred Value : 1.000           
##          Neg Pred Value : 0.750           
##              Prevalence : 0.625           
##          Detection Rate : 0.500           
##    Detection Prevalence : 0.500           
##       Balanced Accuracy : 0.900           
##                                           
##        'Positive' Class : 0               
## 

Now lets evaluate the model performance with area under ROC curve..The ROC is a curve generated by plotting the true positive rate (TPR) against the false positive rate (FPR) at various threshold settings while the AUC is the area under the ROC curve. As a rule of thumb, a model with good predictive ability should have an AUC closer to 1 (1 is ideal) than to 0.5.

pr <- prediction(test_data$LoyaltyPredicted, test_data$Loyalty)
                 
prf <- performance(pr, measure = "tpr", x.measure = "fpr")
plot(prf)

auc <- performance(pr, measure = "auc")

auc <- auc@y.values[[1]]
auc
## [1] 0.875