Sameer Mathur
Logistic Regression Using Caret Package
library(data.table)
# reading data
apparels10 <- fread("apparels10V2.csv")
attach(apparels10)
# dimension of the dataset
dim(apparels10)
[1] 154793 46
# structure of the data table
str(apparels10)
Classes 'data.table' and 'data.frame': 154793 obs. of 46 variables:
$ V1 : int 1 2 3 4 5 6 7 8 9 10 ...
$ OrderItemCode : int 49090394 49008396 48993132 49009894 48021310 48163366 48080866 48089426 48168922 48170338 ...
$ OrderID : int 28573214 28527434 28518702 28528360 27951446 28033958 27986090 27991226 28037062 28037862 ...
$ OrderDate : chr "04-12-15" "04-12-15" "04-12-15" "04-12-15" ...
$ Weekend : chr "Yes" "Yes" "Yes" "Yes" ...
$ Category : chr "Apparels" "Apparels" "Apparels" "Apparels" ...
$ SubCategoryOld : chr "DRESSES" "DRESSES" "DRESSES" "DRESSES" ...
$ SubCategory : chr "DRESSES" "DRESSES" "DRESSES" "DRESSES" ...
$ Brand : chr "SHAKUMBHARI" "SHAKUMBHARI" "SHAKUMBHARI" "SHAKUMBHARI" ...
$ MRP : int 995 995 995 1695 1095 1195 1095 995 995 1395 ...
$ VendorDiscount : chr "Yes" "Yes" "Yes" "Yes" ...
$ WebsiteDiscount : chr "No" "Yes" "No" "No" ...
$ FinalTotalPrice : num 676 645 627 1068 739 ...
$ ProductColor : chr "BLACK" "MULTI" "MULTI" "BEIGE" ...
$ ProductSize : chr "XL" "M" "XL" "M" ...
$ VATPercent : num 5 5 5 5 5 5 5 5 5 5 ...
$ VAT : num 29.9 28.4 29.9 50.9 32.9 ...
$ CODCharge : int 49 49 0 0 49 49 49 49 49 0 ...
$ VendorDiscountAmt : int 398 398 398 678 438 478 438 398 398 907 ...
$ WebsiteDiscountCode : chr "" "APP05" "" "" ...
$ WebsiteDiscountAmt : num 0 29.9 0 0 0 ...
$ CustomerID : int 15618280 10018851 15603874 9286408 15420268 15397246 14319998 15435426 15391266 9655422 ...
$ COD : chr "Yes" "Yes" "Yes" "Yes" ...
$ ShippingName : chr "Kim Louis" "MANISHA SINGH" "Salina Dcosta" "Arun" ...
$ ShippingCity : chr "Pune" "Noida" "South Goa" "Shimla" ...
$ ShippingState : chr "MH" "UP" "GA" "HP" ...
$ Metro : chr "Yes" "No" "No" "No" ...
$ ShippingPincode : int 412307 201308 403725 171001 799001 400097 602105 600026 401208 560048 ...
$ ShippingAddressType : chr "Home" "null" "null" "null" ...
$ ShippingAddressPhone: chr "9.20E+11" "9.20E+11" "9.20E+11" "9.20E+11" ...
$ BillingName : chr "Kim Louis" "MANISHA SINGH" "Salina Dcosta" "Arun" ...
$ BillingCity : chr "Pune" "Noida" "South Goa" "Shimla" ...
$ BillingState : chr "MH" "UP" "GA" "HP" ...
$ BillingPincode : int 412307 201308 403725 171001 799001 400097 602105 600026 401208 560048 ...
$ BillingAddressType : chr "Home" "null" "null" "null" ...
$ ItemSKUCode :integer64 3000227995 3000886641 3000886643 3000927083 3000171753 3000838718 3000171751 3000886643 ...
$ VoucherCode : chr "" "APP05" "" "" ...
$ SaleOrderStatus : chr "COMPLETE" "COMPLETE" "COMPLETE" "COMPLETE" ...
$ SaleOrderItemStatus : chr "DELIVERED" "DISPATCHED" "DELIVERED" "DISPATCHED" ...
$ ShippingProvider : chr "DELHIVERY" "JAVAS" "DELHIVERY" "JAVAS" ...
$ Dispatch Date : chr "4/14/15 18:30" "4/16/15 18:30" "4/14/15 20:20" "4/16/15 17:52" ...
$ ReturnDate : logi NA NA NA NA NA NA ...
$ ReceiveDate : logi NA NA NA NA NA NA ...
$ BundlesDiscount : num 0 0 0 0 0 0 0 0 0 0 ...
$ Payment Gateway : chr "null" "null" "null" "null" ...
$ ActualPayment_Mode : chr "COD" "COD" "COD" "COD" ...
- attr(*, ".internal.selfref")=<externalptr>
# convert 'Weekend' as a factor
apparels10[, Weekend := as.factor(Weekend)]
# convert 'SubCategory' as a factor
apparels10[,SubCategory := as.factor(SubCategory)]
# convert 'SubCategoryOld' as a factor
apparels10[, SubCategoryOld := as.factor(SubCategoryOld)]
# convert 'Brand' as a factor
apparels10[, Brand := as.factor(Brand)]
# convert 'VendorDiscount' as a factor
apparels10[, VendorDiscount := as.factor(VendorDiscount)]
# convert 'WebsiteDiscount' as a factor
apparels10[, WebsiteDiscount := as.factor(WebsiteDiscount)]
# convert 'COD' as a factor
apparels10[, COD := as.factor(COD)]
# convert 'Metro' as a factor
apparels10[, Metro := as.factor(Metro)]
# structure of the converted factor variables
 Weekend        : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 1 1 ...
 SubCategory    : Factor w/ 8 levels "DRESSES","KURTAS",..: 1 1 1 1 1 1 1 1 1 1 ...
 Brand          : Factor w/ 59 levels "109•À_F","AABOLI",..: 48 48 48 48 48 48 48 48 48 48 ...
 VendorDiscount : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 2 2 ...
 WebsiteDiscount: Factor w/ 2 levels "No","Yes": 1 2 1 1 1 1 1 1 1 1 ...
 COD            : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 2 1 ...
 Metro          : Factor w/ 2 levels "No","Yes": 2 1 1 1 1 2 2 2 1 1 ...
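The eight conversions above can also be done in a single step; a minimal data.table sketch, assuming the same column names (not executed again here):
# convert all the character columns above to factors in one step
factorCols <- c("Weekend", "SubCategory", "SubCategoryOld", "Brand",
                "VendorDiscount", "WebsiteDiscount", "COD", "Metro")
apparels10[, (factorCols) := lapply(.SD, as.factor), .SDcols = factorCols]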
# descriptive statistics
library(psych)
describe(apparels10)[c(5,8:13,23,27),c(2:5,8:9)]
n mean sd median min max
Weekend* 154793 1.32 0.47 1 1.00 2
SubCategory* 154793 4.97 2.38 5 1.00 8
Brand* 154793 27.92 16.01 26 1.00 59
MRP 154793 1160.04 730.91 999 249.00 17498
VendorDiscount* 154793 1.65 0.48 2 1.00 2
WebsiteDiscount* 154793 1.21 0.41 1 1.00 2
FinalTotalPrice 154793 745.03 390.55 650 100.13 8624
COD* 154793 1.61 0.49 2 1.00 2
Metro* 154793 1.35 0.48 1 1.00 2
Reserve 75% of the observations for training and 25% for testing
# loading the package
library(caTools)
# setting the seed for reproducibility
set.seed(123)
# splitting the data
split = sample.split(apparels10$COD, SplitRatio = 0.75)
# creating the training set
trainingSet = subset(apparels10, split == TRUE)
# creating the test set
testSet = subset(apparels10, split == FALSE)
# dimensions of the full data
dim(apparels10)
[1] 154793 46
# dimensions of the training data
dim(trainingSet)
[1] 116095 46
# dimensions of the test data
dim(testSet)
[1] 38698 46
# proportion of COD in full data
round(prop.table(table(apparels10$COD))*100,4)
No Yes
39.1381 60.8619
# proportion of COD in train data
round(prop.table(table(trainingSet$COD))*100,4)
No Yes
39.1378 60.8622
# proportion of COD in test data
round(prop.table(table(testSet$COD))*100,4)
No Yes
39.139 60.861
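A stratified split can also be done with caret itself; a sketch under the same 75/25 ratio (createDataPartition likewise preserves the COD proportions shown above; this alternative is not used further):
# alternative stratified split using caret::createDataPartition
library(caret)
set.seed(123)
trainIndex <- createDataPartition(apparels10$COD, p = 0.75, list = FALSE)[, 1]
trainingSet2 <- apparels10[trainIndex]
testSet2 <- apparels10[-trainIndex]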
library(caret)
# control parameters
objControl <- trainControl(method = "boot",
number = 2,
returnResamp = 'none',
summaryFunction = twoClassSummary,
classProbs = TRUE,
savePredictions = TRUE)
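The bootstrap with only two resamples keeps the training runs fast; if runtime permits, k-fold cross-validation is a common alternative. A sketch of such a control object (an assumption, not used in the models below):
# alternative control object using 5-fold cross-validation
objControlCV <- trainControl(method = "cv",
                             number = 5,
                             summaryFunction = twoClassSummary,
                             classProbs = TRUE,
                             savePredictions = TRUE)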
set.seed(766)
# model building using caret package
S0 <- train(COD ~ WebsiteDiscount
+ VendorDiscount,
data = trainingSet,
method = 'glmStepAIC',
trControl = objControl,
metric = "ROC",verbose = FALSE)
# summary of the final model
summary(S0$finalModel)
Call:
NULL
Deviance Residuals:
Min 1Q Median 3Q Max
-1.4832 -1.4043 0.8998 0.9664 1.2237
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 0.69521 0.01225 56.75 <2e-16 ***
WebsiteDiscountYes -0.62720 0.01546 -40.56 <2e-16 ***
VendorDiscountYes -0.17613 0.01361 -12.94 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 155419 on 116094 degrees of freedom
Residual deviance: 153769 on 116092 degrees of freedom
AIC: 153775
Number of Fisher Scoring iterations: 4
Predicted Pr(COD), in percent, from Model S0 (WD = WebsiteDiscount, VD = VendorDiscount):
WD VD Pr(COD)
A No No 66.7
B No Yes 62.7
C Yes No 51.7
D Yes Yes 47.3
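These four values follow directly from the S0 coefficients through the inverse logit; a minimal worked sketch using the estimates reported above:
# reproduce the Pr(COD) values (in percent) from the S0 coefficients
b0  <- 0.69521    # intercept
bWD <- -0.62720   # WebsiteDiscountYes
bVD <- -0.17613   # VendorDiscountYes
round(100 * plogis(c(A = b0, B = b0 + bVD, C = b0 + bWD, D = b0 + bWD + bVD)), 1)
#    A    B    C    D
# 66.7 62.7 51.7 47.3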
set.seed(766)
# model building using caret package
S1 <- train(COD ~ WebsiteDiscount * VendorDiscount,
data = trainingSet,
method = 'glmStepAIC',
trControl = objControl,
metric = "ROC",verbose = FALSE)
# summary of the model
summary(S1$finalModel)
Call:
NULL
Deviance Residuals:
Min 1Q Median 3Q Max
-1.4625 -1.4114 0.9170 0.9602 1.2755
Coefficients:
                                      Estimate Std. Error z value Pr(>|z|)
(Intercept)                            0.64901    0.01352  47.993  < 2e-16 ***
WebsiteDiscountYes                    -0.51957    0.02069 -25.107  < 2e-16 ***
VendorDiscountYes                     -0.11390    0.01571  -7.248 4.22e-13 ***
WebsiteDiscountYes:VendorDiscountYes  -0.24321    0.03117  -7.802 6.09e-15 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 155419 on 116094 degrees of freedom
Residual deviance: 153708 on 116091 degrees of freedom
AIC: 153716
Number of Fisher Scoring iterations: 4
Predicted Pr(COD), in percent, from Model S1 (WD = WebsiteDiscount, VD = VendorDiscount):
WD VD Pr(COD)
A No No 65.7
B No Yes 63.1
C Yes No 53.2
D Yes Yes 44.3
Comparison of predicted Pr(COD), in percent, from Models S0 and S1:
   WD   VD    S0    S1
A  No   No   66.7  65.7
B  No   Yes  62.7  63.1
C  Yes  No   51.7  53.2
D  Yes  Yes  47.3  44.3
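Such grids can be reproduced directly with predict() on the fitted caret objects; a sketch, assuming S0 and S1 from the calls above:
# predicted Pr(COD), in percent, for the four WD/VD combinations from both models
newGrid <- expand.grid(WebsiteDiscount = c("No", "Yes"),
                       VendorDiscount  = c("No", "Yes"))
cbind(newGrid,
      S0 = round(100 * predict(S0, newGrid, type = "prob")$Yes, 1),
      S1 = round(100 * predict(S1, newGrid, type = "prob")$Yes, 1))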
set.seed(766)
# model building using caret package
M0 <- train(COD ~ MRP
+ WebsiteDiscount
+ VendorDiscount
+ Weekend
+ Metro
+ SubCategory,
data = trainingSet,
method = 'glmStepAIC',
trControl = objControl,
metric = "ROC",verbose = FALSE)
# summary of the model
M1 <- M0$finalModel
summary(M1)
Call:
NULL
Deviance Residuals:
Min 1Q Median 3Q Max
-2.0148 -1.3041 0.8646 0.9890 1.4198
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 9.283e-01 2.828e-02 32.822 < 2e-16 ***
MRP 7.360e-05 1.063e-05 6.925 4.37e-12 ***
WebsiteDiscountYes -6.128e-01 1.578e-02 -38.822 < 2e-16 ***
VendorDiscountYes -2.382e-01 1.482e-02 -16.076 < 2e-16 ***
WeekendYes -1.032e-01 1.300e-02 -7.945 1.95e-15 ***
MetroYes -2.322e-01 1.272e-02 -18.257 < 2e-16 ***
SubCategoryKURTAS -2.834e-01 2.620e-02 -10.814 < 2e-16 ***
SubCategoryKURTIS -2.363e-01 2.839e-02 -8.324 < 2e-16 ***
SubCategoryPANTS -1.387e-01 3.271e-02 -4.241 2.22e-05 ***
SubCategorySAREES 1.431e-01 2.674e-02 5.351 8.76e-08 ***
SubCategorySHIRTS -3.545e-01 2.594e-02 -13.665 < 2e-16 ***
`SubCategoryT-SHIRTS` -1.406e-01 2.569e-02 -5.473 4.43e-08 ***
SubCategoryTOPS -2.385e-01 2.564e-02 -9.301 < 2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 155419 on 116094 degrees of freedom
Residual deviance: 152559 on 116082 degrees of freedom
AIC: 152585
Number of Fisher Scoring iterations: 4
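One common way to read the M1 coefficients is as odds ratios; a minimal sketch, assuming the glm object M1 extracted above:
# odds ratios for Model M1; values below 1 indicate lower odds of choosing COD
round(exp(coef(M1)), 3)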
set.seed(766)
# model building using caret package
M2 <- train(COD ~ MRP * WebsiteDiscount * VendorDiscount
+ Weekend * MRP
+ Metro * MRP
+ SubCategory,
data = trainingSet,
method = 'glmStepAIC',
trControl = objControl,
metric = "ROC",verbose = FALSE)
# summary of the model
M3 <- M2$finalModel
summary(M3)
Call:
NULL
Deviance Residuals:
Min 1Q Median 3Q Max
-2.2249 -1.3144 0.8759 0.9908 1.7071
Coefficients:
                                           Estimate Std. Error z value Pr(>|z|)
(Intercept)                               6.277e-01  4.375e-02  14.349  < 2e-16 ***
MRP                                       3.752e-04  4.396e-05   8.535  < 2e-16 ***
WebsiteDiscountYes                       -7.075e-01  5.542e-02 -12.766  < 2e-16 ***
VendorDiscountYes                         5.851e-02  3.760e-02   1.556 0.119719
WeekendYes                               -9.664e-02  1.306e-02  -7.402 1.35e-13 ***
MetroYes                                 -1.932e-01  2.459e-02  -7.857 3.93e-15 ***
SubCategoryKURTAS                        -2.564e-01  2.637e-02  -9.721  < 2e-16 ***
SubCategoryKURTIS                        -1.822e-01  2.875e-02  -6.337 2.34e-10 ***
SubCategoryPANTS                         -1.213e-01  3.285e-02  -3.692 0.000222 ***
SubCategorySAREES                         1.848e-01  2.699e-02   6.849 7.45e-12 ***
SubCategorySHIRTS                        -3.432e-01  2.603e-02 -13.187  < 2e-16 ***
SubCategoryT-SHIRTS                      -7.389e-02  2.629e-02  -2.811 0.004942 **
SubCategoryTOPS                          -2.254e-01  2.571e-02  -8.769  < 2e-16 ***
MRP:WebsiteDiscountYes                    1.007e-04  5.886e-05   1.711 0.087111 .
MRP:VendorDiscountYes                    -3.108e-04  4.414e-05  -7.042 1.90e-12 ***
WebsiteDiscountYes:VendorDiscountYes      2.385e-01  7.347e-02   3.246 0.001169 **
MRP:MetroYes                             -3.478e-05  1.872e-05  -1.858 0.063222 .
MRP:WebsiteDiscountYes:VendorDiscountYes -3.012e-04  6.776e-05  -4.445 8.78e-06 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 155419 on 116094 degrees of freedom
Residual deviance: 152329 on 116077 degrees of freedom
AIC: 152365
Number of Fisher Scoring iterations: 4
# predicted probabilities
predProbTest <- predict(M2, testSet[,c(5,8,10:12,27)], type = "prob")
# plot of probabilities
plot(predProbTest$Yes,
main = "Scatterplot of Probabilities of COD (test data)",
xlab = "Test Set Observation Index",
ylab = "Predicted Probability of COD")
# minimum value of predicted probability
min(predProbTest$Yes)
[1] 0.1213258
# maximum value of predicted probability
max(predProbTest$Yes)
[1] 0.982933
# mean value of predicted probability
mean(predProbTest$Yes)
[1] 0.6085065
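Since the models were tuned on the ROC metric, the test-set ROC curve and AUC are natural summaries; a sketch using the pROC package (an assumption, pROC is not used elsewhere in this document):
# ROC curve and AUC for Model M2 on the test data
library(pROC)
rocM2 <- roc(response = testSet$COD,
             predictor = predProbTest$Yes,
             levels = c("No", "Yes"))
auc(rocM2)
plot(rocM2, main = "ROC Curve for Model M2 (test data)")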
library(caret)
predProbTestM0 <- predict(M0, testSet[,c(5,8,10:12,27)], type = "prob")
# confusion matrix using caret package
yPredM0 <- ifelse(predProbTestM0$Yes > 0.5, "Yes", "No")
predYM0 <- as.factor(yPredM0)
confusionMatrix(data = predYM0, reference = testSet$COD, positive = "Yes")
Confusion Matrix and Statistics
Reference
Prediction No Yes
No 2118 1898
Yes 13028 21654
Accuracy : 0.6143
95% CI : (0.6094, 0.6191)
No Information Rate : 0.6086
P-Value [Acc > NIR] : 0.01107
Kappa : 0.0682
Mcnemar's Test P-Value : < 2e-16
Sensitivity : 0.9194
Specificity : 0.1398
Pos Pred Value : 0.6244
Neg Pred Value : 0.5274
Prevalence : 0.6086
Detection Rate : 0.5596
Detection Prevalence : 0.8962
Balanced Accuracy : 0.5296
'Positive' Class : Yes
library(caret)
predProbTestM2 <- predict(M2, testSet[,c(5,8,10:12,27)], type = "prob")
# confusion matrix using caret package
yPredM2 <- ifelse(predProbTestM2$Yes > 0.5, "Yes", "No")
predYM2 <- as.factor(yPredM2)
confusionMatrix(data = predYM2, reference = testSet$COD, positive = "Yes")
Confusion Matrix and Statistics
Reference
Prediction No Yes
No 2249 1935
Yes 12897 21617
Accuracy : 0.6167
95% CI : (0.6119, 0.6216)
No Information Rate : 0.6086
P-Value [Acc > NIR] : 0.0005397
Kappa : 0.0762
Mcnemar's Test P-Value : < 2.2e-16
Sensitivity : 0.9178
Specificity : 0.1485
Pos Pred Value : 0.6263
Neg Pred Value : 0.5375
Prevalence : 0.6086
Detection Rate : 0.5586
Detection Prevalence : 0.8919
Balanced Accuracy : 0.5332
'Positive' Class : Yes
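Both models have high sensitivity but low specificity at the default 0.5 cutoff; a sketch of how a different cutoff could be examined (0.6 is an arbitrary illustrative value):
# confusion matrix for Model M2 with a higher classification cutoff
yPredM2_60 <- as.factor(ifelse(predProbTestM2$Yes > 0.6, "Yes", "No"))
confusionMatrix(data = yPredM2_60, reference = testSet$COD, positive = "Yes")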
# extracting AIC
M0$finalModel$aic
[1] 152585
# extracting AIC
M2$finalModel$aic
[1] 152364.5
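Because the main-effects model M1 is nested inside the interaction model M3, the two underlying glm fits can also be compared with a likelihood-ratio test; a minimal sketch, assuming M1 and M3 as extracted above:
# likelihood-ratio test comparing the nested glm fits
anova(M1, M3, test = "Chisq")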