Predict delivery delay with logistic regression and random forest

Dataset: E-Commerce Shipping https://www.kaggle.com/code/bahaddinyavuz/e-commerce-shipping-visualization/data
# Load packages (the Excel reader packages are optional here; the data is read from a CSV)
library(tidyverse)
library(readxlsb)
library(readxl)
library(xlsx)
library(hrbrthemes)
library(caTools)
pacman::p_load(caret, corrplot, knitr, car,
               ROCR, IRdisplay, e1071, earth, fastDummies)

NOTE: set the working directory to the local folder that contains Train.csv (change this path to match your own machine).

setwd("C:/Users/nsk_z/OneDrive/Desktop/Data Science/1. Ecommerce Shipping Visualization")

shipping <- read.csv(file = 'Train.csv')

head(shipping)
##   ID Warehouse_block Mode_of_Shipment Customer_care_calls Customer_rating
## 1  1               D           Flight                   4               2
## 2  2               F           Flight                   4               5
## 3  3               A           Flight                   2               2
## 4  4               B           Flight                   3               3
## 5  5               C           Flight                   2               2
## 6  6               F           Flight                   3               1
##   Cost_of_the_Product Prior_purchases Product_importance Gender
## 1                 177               3                low      F
## 2                 216               2                low      M
## 3                 183               4                low      M
## 4                 176               4             medium      M
## 5                 184               3             medium      F
## 6                 162               3             medium      F
##   Discount_offered Weight_in_gms Reached.on.Time_Y.N
## 1               44          1233                   1
## 2               59          3088                   1
## 3               48          3374                   1
## 4               10          1177                   1
## 5               46          2484                   1
## 6               12          1417                   1
summary(shipping)
##        ID        Warehouse_block    Mode_of_Shipment   Customer_care_calls
##  Min.   :    1   Length:10999       Length:10999       Min.   :2.000      
##  1st Qu.: 2750   Class :character   Class :character   1st Qu.:3.000      
##  Median : 5500   Mode  :character   Mode  :character   Median :4.000      
##  Mean   : 5500                                         Mean   :4.054      
##  3rd Qu.: 8250                                         3rd Qu.:5.000      
##  Max.   :10999                                         Max.   :7.000      
##  Customer_rating Cost_of_the_Product Prior_purchases  Product_importance
##  Min.   :1.000   Min.   : 96.0       Min.   : 2.000   Length:10999      
##  1st Qu.:2.000   1st Qu.:169.0       1st Qu.: 3.000   Class :character  
##  Median :3.000   Median :214.0       Median : 3.000   Mode  :character  
##  Mean   :2.991   Mean   :210.2       Mean   : 3.568                     
##  3rd Qu.:4.000   3rd Qu.:251.0       3rd Qu.: 4.000                     
##  Max.   :5.000   Max.   :310.0       Max.   :10.000                     
##     Gender          Discount_offered Weight_in_gms  Reached.on.Time_Y.N
##  Length:10999       Min.   : 1.00    Min.   :1001   Min.   :0.0000     
##  Class :character   1st Qu.: 4.00    1st Qu.:1840   1st Qu.:0.0000     
##  Mode  :character   Median : 7.00    Median :4149   Median :1.0000     
##                     Mean   :13.37    Mean   :3634   Mean   :0.5967     
##                     3rd Qu.:10.00    3rd Qu.:5050   3rd Qu.:1.0000     
##                     Max.   :65.00    Max.   :7846   Max.   :1.0000
class(shipping)
## [1] "data.frame"
sapply(shipping, class)
##                  ID     Warehouse_block    Mode_of_Shipment Customer_care_calls 
##           "integer"         "character"         "character"           "integer" 
##     Customer_rating Cost_of_the_Product     Prior_purchases  Product_importance 
##           "integer"           "integer"           "integer"         "character" 
##              Gender    Discount_offered       Weight_in_gms Reached.on.Time_Y.N 
##         "character"           "integer"           "integer"           "integer"
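The summary above reports no NAs, but an explicit check costs nothing. A minimal sketch in base R (no extra packages assumed):

# Count missing values per column; all zeros means no imputation is needed
colSums(is.na(shipping))
# Confirm the order IDs are unique
sum(duplicated(shipping$ID))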
Clean up the data
Change the data type
Convert “Warehouse_block” to factor
Convert “Mode_of_Shipment” to factor
Convert “Product_importance” to factor
Convert “Gender” to factor
Convert “Reached.on.Time_Y.N” to factor
factor_cols <- c("Warehouse_block","Mode_of_Shipment","Product_importance","Gender","Reached.on.Time_Y.N")
shipping[factor_cols] <- lapply(shipping[factor_cols], as.factor)
sapply(shipping,class)
##                  ID     Warehouse_block    Mode_of_Shipment Customer_care_calls 
##           "integer"            "factor"            "factor"           "integer" 
##     Customer_rating Cost_of_the_Product     Prior_purchases  Product_importance 
##           "integer"           "integer"           "integer"            "factor" 
##              Gender    Discount_offered       Weight_in_gms Reached.on.Time_Y.N 
##            "factor"           "integer"           "integer"            "factor"
EDA
##### 1. Which warehouse contributes to delays?
shipping %>%
  group_by(Warehouse_block) %>%
  filter(Reached.on.Time_Y.N == 1) %>%
  summarise(total_delay = n()) %>%
  ggplot(aes(x= Warehouse_block, y = total_delay)) +
  geom_bar(stat = 'identity')
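Raw counts partly reflect how many orders each block handles, so a delay rate per warehouse is the fairer comparison. A minimal sketch using the same dplyr/ggplot2 verbs as above:

# Delay rate = delayed orders / all orders handled by each warehouse block
shipping %>%
  group_by(Warehouse_block) %>%
  summarise(delay_rate = mean(Reached.on.Time_Y.N == 1)) %>%
  ggplot(aes(x = Warehouse_block, y = delay_rate)) +
  geom_col()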

##### 2. Which shipment mode contributes to delays?

shipping %>%
  group_by(Mode_of_Shipment) %>%
  filter(Reached.on.Time_Y.N == 1) %>%
  summarise(total_delay = n()) %>%
  ggplot(aes(x= Mode_of_Shipment, y = total_delay)) +
  geom_bar(stat = 'identity')

##### 3. Which product importance level contributes to delays?

shipping %>%
  group_by(Product_importance) %>%
  filter(Reached.on.Time_Y.N == 1) %>%
  summarise(total_delay = n()) %>%
  ggplot(aes(x= Product_importance, y = total_delay)) +
  geom_bar(stat = 'identity')

##### 4. Weight distribution for delayed vs. on-time shipments ##### Finding: lighter shipments are delayed more often, whereas heavier shipments are delayed less.

p1 <- ggplot(data=shipping, aes(x=Weight_in_gms, group=Reached.on.Time_Y.N, fill=Reached.on.Time_Y.N)) +
  geom_density(adjust=1.5, alpha=.4) +
  theme_ipsum()
p1
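To back the density plot with numbers, a quick grouped summary of weight by outcome (a minimal sketch, same data and packages as above):

# Median and mean weight for on-time (0) vs delayed (1) shipments
shipping %>%
  group_by(Reached.on.Time_Y.N) %>%
  summarise(median_weight = median(Weight_in_gms),
            mean_weight   = mean(Weight_in_gms))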

##### 5. Cost distribution among delayed shipments

p2 <- shipping %>%
  filter(Reached.on.Time_Y.N == 1) %>%
  ggplot(aes(x=Cost_of_the_Product)) +
  geom_density(adjust = 1.5)
p2

##### 6. Does a larger discount mean more delay? ##### Finding from this plot: most delayed orders carry only small discounts.

p3 <- shipping %>%
  filter(Reached.on.Time_Y.N == 1) %>%
  ggplot(aes(x=Discount_offered)) +
  geom_density(adjust = 1.5)
p3
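The same kind of grouped summary puts a number on the discount question, comparing delayed and on-time orders directly rather than looking at delayed orders alone (sketch, no new packages):

# Median and mean discount for on-time (0) vs delayed (1) shipments
shipping %>%
  group_by(Reached.on.Time_Y.N) %>%
  summarise(median_discount = median(Discount_offered),
            mean_discount   = mean(Discount_offered))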

##### Check whether the target variable needs balancing

shipping %>%
  group_by(Reached.on.Time_Y.N) %>%
  summarise(per = n()/nrow(shipping)) %>%
  ggplot(aes(x=Reached.on.Time_Y.N, y = per, fill = Reached.on.Time_Y.N)) +
  geom_bar(stat = 'identity') +
  geom_text(aes(label = round(per,2)), vjust =2)

##### Split into training and test sets

set.seed(123)
split <- sample.split(shipping$Reached.on.Time_Y.N, SplitRatio = 0.7)
trainset <- subset(shipping, split == TRUE)
testset <- subset(shipping, split == FALSE)
dim(trainset)
## [1] 7699   12
dim(testset)
## [1] 3300   12
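caTools::sample.split() stratifies on the label, so both subsets should keep the roughly 60/40 class mix; a quick check (sketch):

# Class proportions in each subset should match the full data
prop.table(table(trainset$Reached.on.Time_Y.N))
prop.table(table(testset$Reached.on.Time_Y.N))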
First model to predict delivery delay (logistic regression)
model <- glm(Reached.on.Time_Y.N~ .-ID ,  data = trainset, family = binomial)
summary(model)
## 
## Call:
## glm(formula = Reached.on.Time_Y.N ~ . - ID, family = binomial, 
##     data = trainset)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.7117  -1.0801   0.1242   1.0882   1.8396  
## 
## Coefficients:
##                            Estimate Std. Error z value Pr(>|z|)    
## (Intercept)               1.528e+00  2.553e-01   5.982 2.20e-09 ***
## Warehouse_blockB         -3.235e-02  9.031e-02  -0.358 0.720226    
## Warehouse_blockC         -2.958e-02  8.987e-02  -0.329 0.742078    
## Warehouse_blockD          2.412e-02  8.941e-02   0.270 0.787327    
## Warehouse_blockF          1.099e-02  7.732e-02   0.142 0.886944    
## Mode_of_ShipmentRoad     -4.382e-02  9.202e-02  -0.476 0.633899    
## Mode_of_ShipmentShip     -4.140e-02  7.184e-02  -0.576 0.564418    
## Customer_care_calls      -1.023e-01  2.572e-02  -3.976 6.99e-05 ***
## Customer_rating           2.306e-02  1.851e-02   1.246 0.212796    
## Cost_of_the_Product      -2.033e-03  5.965e-04  -3.408 0.000655 ***
## Prior_purchases          -4.763e-02  1.783e-02  -2.671 0.007552 ** 
## Product_importancelow    -3.547e-01  1.002e-01  -3.542 0.000397 ***
## Product_importancemedium -3.830e-01  1.004e-01  -3.816 0.000136 ***
## GenderM                   6.587e-02  5.216e-02   1.263 0.206623    
## Discount_offered          1.142e-01  5.405e-03  21.120  < 2e-16 ***
## Weight_in_gms            -2.237e-04  1.923e-05 -11.633  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 10383.3  on 7698  degrees of freedom
## Residual deviance:  8413.8  on 7683  degrees of freedom
## AIC: 8445.8
## 
## Number of Fisher Scoring iterations: 6
vif(model)
##                         GVIF Df GVIF^(1/(2*Df))
## Warehouse_block     1.005738  4        1.000715
## Mode_of_Shipment    1.002491  2        1.000622
## Customer_care_calls 1.308477  1        1.143887
## Customer_rating     1.001042  1        1.000521
## Cost_of_the_Product 1.204841  1        1.097652
## Prior_purchases     1.087398  1        1.042784
## Product_importance  1.027218  2        1.006736
## Gender              1.002719  1        1.001359
## Discount_offered    1.031445  1        1.015601
## Weight_in_gms       1.402464  1        1.184257
Predict on the training set and check accuracy
trainpredict = predict(model, newdata = trainset, type = 'response')
p_class = ifelse(trainpredict>0.5, 1, 0)
matrix_table = table(trainset$Reached.on.Time_Y.N, p_class)
matrix_table
##    p_class
##        0    1
##   0 1801 1304
##   1 1445 3149
accuracy = sum(diag(matrix_table))/sum(matrix_table)
round(accuracy,3)
## [1] 0.643
varImp(model)
##                             Overall
## Warehouse_blockB          0.3581562
## Warehouse_blockC          0.3291027
## Warehouse_blockD          0.2697834
## Warehouse_blockF          0.1421722
## Mode_of_ShipmentRoad      0.4762467
## Mode_of_ShipmentShip      0.5762910
## Customer_care_calls       3.9764640
## Customer_rating           1.2459149
## Cost_of_the_Product       3.4075511
## Prior_purchases           2.6714843
## Product_importancelow     3.5417752
## Product_importancemedium  3.8158895
## GenderM                   1.2629057
## Discount_offered         21.1204080
## Weight_in_gms            11.6330001
Evaluate on the test set
testPredict = predict(model, newdata = testset, type = 'response')
p_class = ifelse(testPredict>0.5, 1, 0)
matrix_table_test = table(testset$Reached.on.Time_Y.N, p_class)
matrix_table_test
##    p_class
##        0    1
##   0  747  584
##   1  627 1342
accuracy = sum(diag(matrix_table_test))/sum(matrix_table_test)
round(accuracy, 3)
## [1] 0.633
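Beyond raw accuracy, caret's confusionMatrix() (already loaded via pacman) reports sensitivity, specificity and kappa in one call. A minimal sketch reusing p_class and testset from the step above, treating the delayed class "1" as positive:

# caret needs both arguments as factors with the same levels
confusionMatrix(factor(p_class, levels = c(0, 1)),
                testset$Reached.on.Time_Y.N,
                positive = "1")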
Lift Curve
pred = prediction(trainpredict, trainset$Reached.on.Time_Y.N)
perf = performance(pred, "lift", "rpp")
plot(perf, main = "lift curve", xlab = 'Proportion of Customers (sorted prob)')

gain = performance(pred, "tpr", "rpp")
plot(gain, col="orange", lwd = 2)
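The same ROCR prediction object also yields the ROC curve and AUC, a threshold-free summary of the training-set fit (sketch, reusing pred from above):

# ROC curve plus the area under it
roc <- performance(pred, "tpr", "fpr")
plot(roc, col = "blue", lwd = 2)
abline(0, 1, lty = 2)   # reference line for a random classifier
auc <- performance(pred, measure = "auc")@y.values[[1]]
round(auc, 3)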

##### Try another model, dropping the insignificant variables

model1 <- glm(Reached.on.Time_Y.N~ .-ID -Customer_rating -Gender -Mode_of_Shipment -Warehouse_block,  data = trainset, family = binomial)
summary(model1)
## 
## Call:
## glm(formula = Reached.on.Time_Y.N ~ . - ID - Customer_rating - 
##     Gender - Mode_of_Shipment - Warehouse_block, family = binomial, 
##     data = trainset)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.6867  -1.0799   0.1245   1.0883   1.8077  
## 
## Coefficients:
##                            Estimate Std. Error z value Pr(>|z|)    
## (Intercept)               1.591e+00  2.354e-01   6.757 1.41e-11 ***
## Customer_care_calls      -1.011e-01  2.569e-02  -3.938 8.23e-05 ***
## Cost_of_the_Product      -2.031e-03  5.954e-04  -3.410 0.000649 ***
## Prior_purchases          -4.811e-02  1.781e-02  -2.701 0.006914 ** 
## Product_importancelow    -3.574e-01  1.000e-01  -3.572 0.000354 ***
## Product_importancemedium -3.876e-01  1.003e-01  -3.867 0.000110 ***
## Discount_offered          1.142e-01  5.404e-03  21.131  < 2e-16 ***
## Weight_in_gms            -2.235e-04  1.921e-05 -11.637  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 10383  on 7698  degrees of freedom
## Residual deviance:  8418  on 7691  degrees of freedom
## AIC: 8434
## 
## Number of Fisher Scoring iterations: 6
vif(model1)
##                         GVIF Df GVIF^(1/(2*Df))
## Customer_care_calls 1.305398  1        1.142540
## Cost_of_the_Product 1.201254  1        1.096018
## Prior_purchases     1.086402  1        1.042306
## Product_importance  1.024668  2        1.006111
## Discount_offered    1.031418  1        1.015587
## Weight_in_gms       1.400066  1        1.183244
varImp(model1)
##                            Overall
## Customer_care_calls       3.937548
## Cost_of_the_Product       3.410308
## Prior_purchases           2.700958
## Product_importancelow     3.572404
## Product_importancemedium  3.866700
## Discount_offered         21.130622
## Weight_in_gms            11.636738
Predict on the training set and check accuracy
trainpredict = predict(model1, newdata = trainset, type = 'response')
p_class = ifelse(trainpredict>0.5, 1, 0)
matrix_table = table(trainset$Reached.on.Time_Y.N, p_class)
matrix_table
##    p_class
##        0    1
##   0 1793 1312
##   1 1466 3128
accuracy = sum(diag(matrix_table))/sum(matrix_table)
round(accuracy,3)
## [1] 0.639
Evaluate on the test set
testPredict = predict(model1, newdata = testset, type = 'response')
p_class = ifelse(testPredict>0.5, 1, 0)
matrix_table_test = table(testset$Reached.on.Time_Y.N, p_class)
matrix_table_test
##    p_class
##        0    1
##   0  751  580
##   1  633 1336
accuracy = sum(diag(matrix_table_test))/sum(matrix_table_test)
round(accuracy, 3)
## [1] 0.632
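Since model1 is nested inside model, a likelihood-ratio test (or a simple AIC comparison) shows whether dropping the four insignificant variables costs any fit. A minimal sketch using the two fitted objects above:

# Likelihood-ratio test: a small p-value would favour the larger model
anova(model1, model, test = "Chisq")
# AIC comparison: lower is better
AIC(model, model1)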

Try a random forest
Random forest reference: https://uc-r.github.io/random_forests
library(randomForest)

rf_model <- randomForest(Reached.on.Time_Y.N~., data=trainset, proximity=TRUE)
print(rf_model)
## 
## Call:
##  randomForest(formula = Reached.on.Time_Y.N ~ ., data = trainset,      proximity = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 3
## 
##         OOB estimate of  error rate: 33.58%
## Confusion matrix:
##      0    1 class.error
## 0 2325  780   0.2512077
## 1 1805 2789   0.3929038
p1 <- predict(rf_model, testset)
confusionMatrix(p1, testset$Reached.on.Time_Y.N)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0  984  788
##          1  347 1181
##                                           
##                Accuracy : 0.6561          
##                  95% CI : (0.6396, 0.6723)
##     No Information Rate : 0.5967          
##     P-Value [Acc > NIR] : 1.27e-12        
##                                           
##                   Kappa : 0.3218          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.7393          
##             Specificity : 0.5998          
##          Pos Pred Value : 0.5553          
##          Neg Pred Value : 0.7729          
##              Prevalence : 0.4033          
##          Detection Rate : 0.2982          
##    Detection Prevalence : 0.5370          
##       Balanced Accuracy : 0.6695          
##                                           
##        'Positive' Class : 0               
## 
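randomForest records Gini-based variable importance, which is worth setting against varImp() from the logistic models. A minimal sketch using rf_model from above:

# Mean decrease in Gini per predictor, plus the package's built-in dot chart
importance(rf_model)
varImpPlot(rf_model, main = "Random forest variable importance")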
Hyperparameter tuning with ranger (reference: https://www.rdocumentation.org/packages/ranger/versions/0.13.1/topics/ranger)
hyper_grid <- expand.grid(
  mtry = seq(2, 9, by = 1),
  node_size = seq(3,9, by = 2),
  OOB_RMSE = 0)
library(ranger)
for(i in 1:nrow(hyper_grid)) {

  rf_tune <- ranger(
    formula = Reached.on.Time_Y.N ~ .,
    data = trainset,
    num.trees = 200,
    mtry = hyper_grid$mtry[i],
    min.node.size = hyper_grid$node_size[i],
    seed = 123
  )
  # index by i so each combination keeps its own OOB error;
  # without [i] every row is overwritten with the last model's error
  hyper_grid$OOB_RMSE[i] <- sqrt(rf_tune$prediction.error)
}

hyper_grid %>% 
  dplyr::arrange(OOB_RMSE) %>%
  head(10)
##    mtry node_size  OOB_RMSE
## 1     2         3 0.5772003
## 2     3         3 0.5772003
## 3     4         3 0.5772003
## 4     5         3 0.5772003
## 5     6         3 0.5772003
## 6     7         3 0.5772003
## 7     8         3 0.5772003
## 8     9         3 0.5772003
## 9     2         5 0.5772003
## 10    3         5 0.5772003
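Once the grid identifies the best mtry / min.node.size pair, the natural final step is to refit ranger with those values and score the held-out test set. A minimal sketch (best is simply the top row of the tuned grid; the settings below are assumptions, not results from the run above):

# Pick the combination with the lowest OOB error
best <- hyper_grid %>% dplyr::arrange(OOB_RMSE) %>% dplyr::slice(1)

rf_final <- ranger(
  formula = Reached.on.Time_Y.N ~ .,
  data = trainset,
  num.trees = 200,
  mtry = best$mtry,
  min.node.size = best$node_size,
  seed = 123)

# ranger returns class predictions inside $predictions
pred_rf <- predict(rf_final, data = testset)$predictions
confusionMatrix(pred_rf, testset$Reached.on.Time_Y.N)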