Jabong Apparels Data Analysis

Sameer Mathur

Logistic Regression Using Caret Package

IMPORTING DATA

(READING AND PREPARING DATA)

Importing Data

library(data.table)
# Read the order-level apparel data into a data.table.
apparels10 <- fread("apparels10V2.csv")
# attach() is deliberately not used: it would place a snapshot of the columns
# on the search path, and that snapshot goes stale as soon as columns are
# converted by reference with `:=`. Columns are always accessed through
# apparels10 (or a subset of it) instead.
# dimension of the dataset (rows, columns)
dim(apparels10)
[1] 154793     46

Data Structure

# structure of the data table: object class, number of observations and
# variables, and each column's storage type with its first few values
str(apparels10)
Classes 'data.table' and 'data.frame':  154793 obs. of  46 variables:
 $ V1                  : int  1 2 3 4 5 6 7 8 9 10 ...
 $ OrderItemCode       : int  49090394 49008396 48993132 49009894 48021310 48163366 48080866 48089426 48168922 48170338 ...
 $ OrderID             : int  28573214 28527434 28518702 28528360 27951446 28033958 27986090 27991226 28037062 28037862 ...
 $ OrderDate           : chr  "04-12-15" "04-12-15" "04-12-15" "04-12-15" ...
 $ Weekend             : chr  "Yes" "Yes" "Yes" "Yes" ...
 $ Category            : chr  "Apparels" "Apparels" "Apparels" "Apparels" ...
 $ SubCategoryOld      : chr  "DRESSES" "DRESSES" "DRESSES" "DRESSES" ...
 $ SubCategory         : chr  "DRESSES" "DRESSES" "DRESSES" "DRESSES" ...
 $ Brand               : chr  "SHAKUMBHARI" "SHAKUMBHARI" "SHAKUMBHARI" "SHAKUMBHARI" ...
 $ MRP                 : int  995 995 995 1695 1095 1195 1095 995 995 1395 ...
 $ VendorDiscount      : chr  "Yes" "Yes" "Yes" "Yes" ...
 $ WebsiteDiscount     : chr  "No" "Yes" "No" "No" ...
 $ FinalTotalPrice     : num  676 645 627 1068 739 ...
 $ ProductColor        : chr  "BLACK" "MULTI" "MULTI" "BEIGE" ...
 $ ProductSize         : chr  "XL" "M" "XL" "M" ...
 $ VATPercent          : num  5 5 5 5 5 5 5 5 5 5 ...
 $ VAT                 : num  29.9 28.4 29.9 50.9 32.9 ...
 $ CODCharge           : int  49 49 0 0 49 49 49 49 49 0 ...
 $ VendorDiscountAmt   : int  398 398 398 678 438 478 438 398 398 907 ...
 $ WebsiteDiscountCode : chr  "" "APP05" "" "" ...
 $ WebsiteDiscountAmt  : num  0 29.9 0 0 0 ...
 $ CustomerID          : int  15618280 10018851 15603874 9286408 15420268 15397246 14319998 15435426 15391266 9655422 ...
 $ COD                 : chr  "Yes" "Yes" "Yes" "Yes" ...
 $ ShippingName        : chr  "Kim Louis" "MANISHA SINGH" "Salina Dcosta" "Arun" ...
 $ ShippingCity        : chr  "Pune" "Noida" "South Goa" "Shimla" ...
 $ ShippingState       : chr  "MH" "UP" "GA" "HP" ...
 $ Metro               : chr  "Yes" "No" "No" "No" ...
 $ ShippingPincode     : int  412307 201308 403725 171001 799001 400097 602105 600026 401208 560048 ...
 $ ShippingAddressType : chr  "Home" "null" "null" "null" ...
 $ ShippingAddressPhone: chr  "9.20E+11" "9.20E+11" "9.20E+11" "9.20E+11" ...
 $ BillingName         : chr  "Kim Louis" "MANISHA  SINGH" "Salina Dcosta" "Arun" ...
 $ BillingCity         : chr  "Pune" "Noida" "South Goa" "Shimla" ...
 $ BillingState        : chr  "MH" "UP" "GA" "HP" ...
 $ BillingPincode      : int  412307 201308 403725 171001 799001 400097 602105 600026 401208 560048 ...
 $ BillingAddressType  : chr  "Home" "null" "null" "null" ...
 $ ItemSKUCode         :integer64 3000227995 3000886641 3000886643 3000927083 3000171753 3000838718 3000171751 3000886643 ... 
 $ VoucherCode         : chr  "" "APP05" "" "" ...
 $ SaleOrderStatus     : chr  "COMPLETE" "COMPLETE" "COMPLETE" "COMPLETE" ...
 $ SaleOrderItemStatus : chr  "DELIVERED" "DISPATCHED" "DELIVERED" "DISPATCHED" ...
 $ ShippingProvider    : chr  "DELHIVERY" "JAVAS" "DELHIVERY" "JAVAS" ...
 $ Dispatch Date       : chr  "4/14/15 18:30" "4/16/15 18:30" "4/14/15 20:20" "4/16/15 17:52" ...
 $ ReturnDate          : logi  NA NA NA NA NA NA ...
 $ ReceiveDate         : logi  NA NA NA NA NA NA ...
 $ BundlesDiscount     : num  0 0 0 0 0 0 0 0 0 0 ...
 $ Payment Gateway     : chr  "null" "null" "null" "null" ...
 $ ActualPayment_Mode  : chr  "COD" "COD" "COD" "COD" ...
 - attr(*, ".internal.selfref")=<externalptr> 

Convert Data Type

# The categorical columns are read in as character vectors. Convert all of
# them (including the response, COD) to factors by reference in one pass.
factorCols <- c("Weekend", "SubCategory", "SubCategoryOld", "Brand",
                "VendorDiscount", "WebsiteDiscount", "COD", "Metro")
apparels10[, (factorCols) := lapply(.SD, as.factor), .SDcols = factorCols]

Verifying Data Type Conversion

 Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 1 1 ...
 Factor w/ 8 levels "DRESSES","KURTAS",..: 1 1 1 1 1 1 1 1 1 1 ...
 Factor w/ 59 levels "109•À_F","AABOLI",..: 48 48 48 48 48 48 48 48 48 48 ...
 Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 2 2 ...
 Factor w/ 2 levels "No","Yes": 1 2 1 1 1 1 1 1 1 1 ...
 Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 2 1 ...
 Factor w/ 2 levels "No","Yes": 2 1 1 1 1 2 2 2 1 1 ...

Descriptive Statistics

# descriptive statistics
library(psych)
# Pick the variables and the reported statistics by name instead of by
# position, so the table does not silently change if the column layout of
# the input file (or of describe()'s output) changes. This also avoids
# describing identifier/free-text columns that are not analysed.
descVars <- c("Weekend", "SubCategory", "Brand", "MRP", "VendorDiscount",
              "WebsiteDiscount", "FinalTotalPrice", "COD", "Metro")
descStats <- c("n", "mean", "sd", "median", "min", "max")
describe(apparels10[, descVars, with = FALSE])[, descStats]
                      n    mean     sd median    min   max
Weekend*         154793    1.32   0.47      1   1.00     2
SubCategory*     154793    4.97   2.38      5   1.00     8
Brand*           154793   27.92  16.01     26   1.00    59
MRP              154793 1160.04 730.91    999 249.00 17498
VendorDiscount*  154793    1.65   0.48      2   1.00     2
WebsiteDiscount* 154793    1.21   0.41      1   1.00     2
FinalTotalPrice  154793  745.03 390.55    650 100.13  8624
COD*             154793    1.61   0.49      2   1.00     2
Metro*           154793    1.35   0.48      1   1.00     2

Creating Train and Test dataset

Reserve 75% for training and 25% for testing

# caTools provides sample.split()
library(caTools)
# seed the random number generator so the same rows are drawn on every run
set.seed(123)
# one logical flag per row, drawn from the COD labels with SplitRatio = 0.75;
# TRUE marks a training row
split <- sample.split(apparels10$COD, SplitRatio = 0.75)
# rows flagged TRUE form the training set
trainingSet <- subset(apparels10, split)
# the remaining rows form the test set
testSet <- subset(apparels10, !split)

Verifying the Training Set and Test Set

# dimensions of the full data (rows, columns)
dim(apparels10)
[1] 154793     46
# dimensions of the training data (rows where split is TRUE)
dim(trainingSet)
[1] 116095     46
# dimensions of the test data (rows where split is FALSE)
dim(testSet)
[1] 38698    46

Explaining Proportions

# percentage share of each COD level in the full data, rounded to 4 decimals
round(prop.table(table(apparels10$COD))*100,4)

     No     Yes 
39.1381 60.8619 
# percentage share of each COD level in the training data, rounded to
# 4 decimals (for comparison with the full-data shares)
round(prop.table(table(trainingSet$COD))*100,4)

     No     Yes 
39.1378 60.8622 
# percentage share of each COD level in the test data, rounded to
# 4 decimals (for comparison with the full-data shares)
round(prop.table(table(testSet$COD))*100,4)

    No    Yes 
39.139 60.861 

MODEL BUILDING USING CARET PACKAGE – BINOMIAL LOGIT CLASSIFIER

Control Parameters

library(caret)
# Resampling and performance settings passed as trControl to the train()
# calls in this script.
objControl <- trainControl(
  method = "boot",                    # bootstrap resampling
  number = 2,                         # number of resampling iterations
  returnResamp = "none",              # do not keep per-resample metrics
  summaryFunction = twoClassSummary,  # two-class summary (used with metric = "ROC")
  classProbs = TRUE,                  # compute class probabilities
  savePredictions = TRUE              # keep the hold-out predictions
)

Model S0

# reseed so the resamples drawn inside train() are reproducible
set.seed(766)
# Model S0: main effects of the two discount indicators only, fitted through
# caret's 'glmStepAIC' method and evaluated on ROC.
# NOTE(review): MASS::stepAIC controls its progress output through `trace`;
# confirm that `verbose = FALSE` has the intended effect here.
S0 <- train(
  COD ~ WebsiteDiscount + VendorDiscount,
  data = trainingSet,
  method = "glmStepAIC",
  trControl = objControl,
  metric = "ROC",
  verbose = FALSE
)
# print the model kept as the final fit
S0$finalModel

Summary


Call:
NULL

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.4832  -1.4043   0.8998   0.9664   1.2237  

Coefficients:
                   Estimate Std. Error z value Pr(>|z|)    
(Intercept)         0.69521    0.01225   56.75   <2e-16 ***
WebsiteDiscountYes -0.62720    0.01546  -40.56   <2e-16 ***
VendorDiscountYes  -0.17613    0.01361  -12.94   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 155419  on 116094  degrees of freedom
Residual deviance: 153769  on 116092  degrees of freedom
AIC: 153775

Number of Fisher Scoring iterations: 4

Pr(COD) from Model S0

  WD  VD  Pr(COD)
A No  No  66.7   
B No  Yes 62.7   
C Yes No  51.7   
D Yes Yes 47.3   

Model S1 - Interaction

# same seed as for S0 so both models see the same resamples
set.seed(766)
# Model S1: the two discount indicators plus their interaction
# (`*` expands to both main effects and the interaction term).
S1 <- train(
  COD ~ WebsiteDiscount * VendorDiscount,
  data = trainingSet,
  method = "glmStepAIC",
  trControl = objControl,
  metric = "ROC",
  verbose = FALSE
)

Summary


Call:
NULL

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.4625  -1.4114   0.9170   0.9602   1.2755  

Coefficients:
                                       Estimate Std. Error z value
(Intercept)                             0.64901    0.01352  47.993
WebsiteDiscountYes                     -0.51957    0.02069 -25.107
VendorDiscountYes                      -0.11390    0.01571  -7.248
`WebsiteDiscountYes:VendorDiscountYes` -0.24321    0.03117  -7.802
                                       Pr(>|z|)    
(Intercept)                             < 2e-16 ***
WebsiteDiscountYes                      < 2e-16 ***
VendorDiscountYes                      4.22e-13 ***
`WebsiteDiscountYes:VendorDiscountYes` 6.09e-15 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 155419  on 116094  degrees of freedom
Residual deviance: 153708  on 116091  degrees of freedom
AIC: 153716

Number of Fisher Scoring iterations: 4

Pr(COD) from Model S1

  WD  VD  Pr(COD)
A No  No  65.7   
B No  Yes 63.1   
C Yes No  53.2   
D Yes Yes 44.3   

Pr(COD) from Model S0

  WD  VD  Pr(COD)
A No  No  66.7   
B No  Yes 62.7   
C Yes No  51.7   
D Yes Yes 47.3   

Pr(COD) from Model S1

  WD  VD  Pr(COD)
A No  No  65.7   
B No  Yes 63.1   
C Yes No  53.2   
D Yes Yes 44.3   

Model M0

Fitting Binomial Logit Regression Model

# reseed so the resamples drawn inside train() are reproducible
set.seed(766)
# Model M0: main-effects logit of COD on price, both discount indicators,
# weekend, metro and product sub-category, fitted through 'glmStepAIC'.
M0 <- train(
  COD ~ MRP + WebsiteDiscount + VendorDiscount + Weekend + Metro + SubCategory,
  data = trainingSet,
  method = "glmStepAIC",
  trControl = objControl,
  metric = "ROC",
  verbose = FALSE
)

Model M1

Best AIC Model

# M1 is the final model stored inside the caret fit M0; print its
# coefficient table, deviances and AIC
M1 <- M0[["finalModel"]]
summary(M1)

Call:
NULL

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.0148  -1.3041   0.8646   0.9890   1.4198  

Coefficients:
                        Estimate Std. Error z value Pr(>|z|)    
(Intercept)            9.283e-01  2.828e-02  32.822  < 2e-16 ***
MRP                    7.360e-05  1.063e-05   6.925 4.37e-12 ***
WebsiteDiscountYes    -6.128e-01  1.578e-02 -38.822  < 2e-16 ***
VendorDiscountYes     -2.382e-01  1.482e-02 -16.076  < 2e-16 ***
WeekendYes            -1.032e-01  1.300e-02  -7.945 1.95e-15 ***
MetroYes              -2.322e-01  1.272e-02 -18.257  < 2e-16 ***
SubCategoryKURTAS     -2.834e-01  2.620e-02 -10.814  < 2e-16 ***
SubCategoryKURTIS     -2.363e-01  2.839e-02  -8.324  < 2e-16 ***
SubCategoryPANTS      -1.387e-01  3.271e-02  -4.241 2.22e-05 ***
SubCategorySAREES      1.431e-01  2.674e-02   5.351 8.76e-08 ***
SubCategorySHIRTS     -3.545e-01  2.594e-02 -13.665  < 2e-16 ***
`SubCategoryT-SHIRTS` -1.406e-01  2.569e-02  -5.473 4.43e-08 ***
SubCategoryTOPS       -2.385e-01  2.564e-02  -9.301  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 155419  on 116094  degrees of freedom
Residual deviance: 152559  on 116082  degrees of freedom
AIC: 152585

Number of Fisher Scoring iterations: 4

Model M2

Fitting Binomial Logit Regression Model with Interaction

# same seed as for M0 so both models see the same resamples
set.seed(766)
# Model M2: as M0, but with the full three-way interaction of price and the
# two discount indicators, plus price-by-weekend and price-by-metro terms.
M2 <- train(
  COD ~ MRP * WebsiteDiscount * VendorDiscount
    + Weekend * MRP
    + Metro * MRP
    + SubCategory,
  data = trainingSet,
  method = "glmStepAIC",
  trControl = objControl,
  metric = "ROC",
  verbose = FALSE
)

Model M3

Best AIC Model with Interaction

# M3 is the final model stored inside the caret fit M2; print its
# coefficient table, deviances and AIC
M3 <- M2[["finalModel"]]
summary(M3)

Call:
NULL

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.2249  -1.3144   0.8759   0.9908   1.7071  

Coefficients:
                                             Estimate Std. Error z value
(Intercept)                                 6.277e-01  4.375e-02  14.349
MRP                                         3.752e-04  4.396e-05   8.535
WebsiteDiscountYes                         -7.075e-01  5.542e-02 -12.766
VendorDiscountYes                           5.851e-02  3.760e-02   1.556
WeekendYes                                 -9.664e-02  1.306e-02  -7.402
MetroYes                                   -1.932e-01  2.459e-02  -7.857
SubCategoryKURTAS                          -2.564e-01  2.637e-02  -9.721
SubCategoryKURTIS                          -1.822e-01  2.875e-02  -6.337
SubCategoryPANTS                           -1.213e-01  3.285e-02  -3.692
SubCategorySAREES                           1.848e-01  2.699e-02   6.849
SubCategorySHIRTS                          -3.432e-01  2.603e-02 -13.187
`SubCategoryT-SHIRTS`                      -7.389e-02  2.629e-02  -2.811
SubCategoryTOPS                            -2.254e-01  2.571e-02  -8.769
`MRP:WebsiteDiscountYes`                    1.007e-04  5.886e-05   1.711
`MRP:VendorDiscountYes`                    -3.108e-04  4.414e-05  -7.042
`WebsiteDiscountYes:VendorDiscountYes`      2.385e-01  7.347e-02   3.246
`MRP:MetroYes`                             -3.478e-05  1.872e-05  -1.858
`MRP:WebsiteDiscountYes:VendorDiscountYes` -3.012e-04  6.776e-05  -4.445
                                           Pr(>|z|)    
(Intercept)                                 < 2e-16 ***
MRP                                         < 2e-16 ***
WebsiteDiscountYes                          < 2e-16 ***
VendorDiscountYes                          0.119719    
WeekendYes                                 1.35e-13 ***
MetroYes                                   3.93e-15 ***
SubCategoryKURTAS                           < 2e-16 ***
SubCategoryKURTIS                          2.34e-10 ***
SubCategoryPANTS                           0.000222 ***
SubCategorySAREES                          7.45e-12 ***
SubCategorySHIRTS                           < 2e-16 ***
`SubCategoryT-SHIRTS`                      0.004942 ** 
SubCategoryTOPS                             < 2e-16 ***
`MRP:WebsiteDiscountYes`                   0.087111 .  
`MRP:VendorDiscountYes`                    1.90e-12 ***
`WebsiteDiscountYes:VendorDiscountYes`     0.001169 ** 
`MRP:MetroYes`                             0.063222 .  
`MRP:WebsiteDiscountYes:VendorDiscountYes` 8.78e-06 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 155419  on 116094  degrees of freedom
Residual deviance: 152329  on 116077  degrees of freedom
AIC: 152365

Number of Fisher Scoring iterations: 4

CLASSIFICATION

Visualization of Predicted Probabilities

# Predictor columns used by the models, selected by name rather than by
# position so the call keeps working if the column layout of the data changes.
predictorCols <- c("Weekend", "SubCategory", "MRP",
                   "VendorDiscount", "WebsiteDiscount", "Metro")

# predicted class probabilities for the test set from the interaction model M2
predProbTest <- predict(M2, testSet[, predictorCols, with = FALSE],
                        type = "prob")

# Plot the predicted probability of COD = "Yes" for each test row.
# plot() of a single vector puts the row position on the x-axis, so the axis
# is labelled as an observation index (it is not a customer identifier).
plot(predProbTest$Yes,
     main = "Scatterplot of Probabilities of COD (test data)",
     xlab = "Test-set observation index",
     ylab = "Predicted Probability of COD")

plot of chunk unnamed-chunk-36

Summary Statistics of Predicted Probabilities

# smallest predicted probability of COD = "Yes" over the test rows
min(predProbTest$Yes)
[1] 0.1213258
# largest predicted probability of COD = "Yes" over the test rows
max(predProbTest$Yes)
[1] 0.982933
# average predicted probability of COD = "Yes" over the test rows
mean(predProbTest$Yes)
[1] 0.6085065

Confusion Matrix

Model M0 (Without Interaction)

Classification (assuming threshold Prob = 50%)

library(caret)
# Predictor columns used by the model, selected by name rather than by
# position so the call keeps working if the column layout of the data changes.
predictorCols <- c("Weekend", "SubCategory", "MRP",
                   "VendorDiscount", "WebsiteDiscount", "Metro")
# predicted class probabilities for the test set from model M0
predProbTestM0 <- predict(M0, testSet[, predictorCols, with = FALSE],
                          type = "prob")
# classify as "Yes" when the predicted probability exceeds the 50% threshold
yPredM0 <- ifelse(predProbTestM0$Yes > 0.5, "Yes", "No")
# Use the reference's levels explicitly: as.factor() would drop a level if
# only one class were ever predicted, leaving prediction and reference with
# mismatched levels in confusionMatrix().
predYM0 <- factor(yPredM0, levels = levels(testSet$COD))
# confusion matrix using caret package, with COD = "Yes" as the positive class
confusionMatrix(data = predYM0, reference = testSet$COD, positive = "Yes")
Confusion Matrix and Statistics

          Reference
Prediction    No   Yes
       No   2118  1898
       Yes 13028 21654

               Accuracy : 0.6143          
                 95% CI : (0.6094, 0.6191)
    No Information Rate : 0.6086          
    P-Value [Acc > NIR] : 0.01107         

                  Kappa : 0.0682          
 Mcnemar's Test P-Value : < 2e-16         

            Sensitivity : 0.9194          
            Specificity : 0.1398          
         Pos Pred Value : 0.6244          
         Neg Pred Value : 0.5274          
             Prevalence : 0.6086          
         Detection Rate : 0.5596          
   Detection Prevalence : 0.8962          
      Balanced Accuracy : 0.5296          

       'Positive' Class : Yes             

Model M2 (With Interaction)

Classification (assuming threshold Prob = 50%)

library(caret)
# Predictor columns used by the model, selected by name rather than by
# position so the call keeps working if the column layout of the data changes.
predictorCols <- c("Weekend", "SubCategory", "MRP",
                   "VendorDiscount", "WebsiteDiscount", "Metro")
# predicted class probabilities for the test set from the interaction model M2
predProbTestM2 <- predict(M2, testSet[, predictorCols, with = FALSE],
                          type = "prob")
# classify as "Yes" when the predicted probability exceeds the 50% threshold
yPredM2 <- ifelse(predProbTestM2$Yes > 0.5, "Yes", "No")
# Use the reference's levels explicitly: as.factor() would drop a level if
# only one class were ever predicted, leaving prediction and reference with
# mismatched levels in confusionMatrix().
predYM2 <- factor(yPredM2, levels = levels(testSet$COD))
# confusion matrix using caret package, with COD = "Yes" as the positive class
confusionMatrix(data = predYM2, reference = testSet$COD, positive = "Yes")
Confusion Matrix and Statistics

          Reference
Prediction    No   Yes
       No   2249  1935
       Yes 12897 21617

               Accuracy : 0.6167          
                 95% CI : (0.6119, 0.6216)
    No Information Rate : 0.6086          
    P-Value [Acc > NIR] : 0.0005397       

                  Kappa : 0.0762          
 Mcnemar's Test P-Value : < 2.2e-16       

            Sensitivity : 0.9178          
            Specificity : 0.1485          
         Pos Pred Value : 0.6263          
         Neg Pred Value : 0.5375          
             Prevalence : 0.6086          
         Detection Rate : 0.5586          
   Detection Prevalence : 0.8919          
      Balanced Accuracy : 0.5332          

       'Positive' Class : Yes             

Comparing Models with & without Interactions

without interaction

Confusion Matrix and Statistics

          Reference
Prediction    No   Yes
       No   2118  1898
       Yes 13028 21654

               Accuracy : 0.6143          
                 95% CI : (0.6094, 0.6191)
    No Information Rate : 0.6086          
    P-Value [Acc > NIR] : 0.01107         

                  Kappa : 0.0682          
 Mcnemar's Test P-Value : < 2e-16         

            Sensitivity : 0.9194          
            Specificity : 0.1398          
         Pos Pred Value : 0.6244          
         Neg Pred Value : 0.5274          
             Prevalence : 0.6086          
         Detection Rate : 0.5596          
   Detection Prevalence : 0.8962          
      Balanced Accuracy : 0.5296          

       'Positive' Class : Yes             

with interaction

Confusion Matrix and Statistics

          Reference
Prediction    No   Yes
       No   2249  1935
       Yes 12897 21617

               Accuracy : 0.6167          
                 95% CI : (0.6119, 0.6216)
    No Information Rate : 0.6086          
    P-Value [Acc > NIR] : 0.0005397       

                  Kappa : 0.0762          
 Mcnemar's Test P-Value : < 2.2e-16       

            Sensitivity : 0.9178          
            Specificity : 0.1485          
         Pos Pred Value : 0.6263          
         Neg Pred Value : 0.5375          
             Prevalence : 0.6086          
         Detection Rate : 0.5586          
   Detection Prevalence : 0.8919          
      Balanced Accuracy : 0.5332          

       'Positive' Class : Yes             

Comparing AIC

Model M0 without interaction

# AIC of the final model kept by the caret fit M0 (no interaction terms);
# compare with the M2 value, where a lower AIC is preferred
M0$finalModel$aic
[1] 152585

Model M2 with interaction

# AIC of the final model kept by the caret fit M2 (with interaction terms);
# compare with the M0 value, where a lower AIC is preferred
M2$finalModel$aic
[1] 152364.5