library(tidyverse)
library(readxlsb)
library(readxl)
library(xlsx)
library(hrbrthemes)
library(caTools)
pacman::p_load(tidyverse, caret, corrplot, caTools, knitr, car,
ROCR, IRdisplay, e1071,earth, fastDummies)
setwd("C:/Users/nsk_z/OneDrive/Desktop/Data Science/1. Ecommerce Shipping Visualization")
shipping <- read.csv(file = 'Train.csv')
head(shipping)
## ID Warehouse_block Mode_of_Shipment Customer_care_calls Customer_rating
## 1 1 D Flight 4 2
## 2 2 F Flight 4 5
## 3 3 A Flight 2 2
## 4 4 B Flight 3 3
## 5 5 C Flight 2 2
## 6 6 F Flight 3 1
## Cost_of_the_Product Prior_purchases Product_importance Gender
## 1 177 3 low F
## 2 216 2 low M
## 3 183 4 low M
## 4 176 4 medium M
## 5 184 3 medium F
## 6 162 3 medium F
## Discount_offered Weight_in_gms Reached.on.Time_Y.N
## 1 44 1233 1
## 2 59 3088 1
## 3 48 3374 1
## 4 10 1177 1
## 5 46 2484 1
## 6 12 1417 1
summary(shipping)
## ID Warehouse_block Mode_of_Shipment Customer_care_calls
## Min. : 1 Length:10999 Length:10999 Min. :2.000
## 1st Qu.: 2750 Class :character Class :character 1st Qu.:3.000
## Median : 5500 Mode :character Mode :character Median :4.000
## Mean : 5500 Mean :4.054
## 3rd Qu.: 8250 3rd Qu.:5.000
## Max. :10999 Max. :7.000
## Customer_rating Cost_of_the_Product Prior_purchases Product_importance
## Min. :1.000 Min. : 96.0 Min. : 2.000 Length:10999
## 1st Qu.:2.000 1st Qu.:169.0 1st Qu.: 3.000 Class :character
## Median :3.000 Median :214.0 Median : 3.000 Mode :character
## Mean :2.991 Mean :210.2 Mean : 3.568
## 3rd Qu.:4.000 3rd Qu.:251.0 3rd Qu.: 4.000
## Max. :5.000 Max. :310.0 Max. :10.000
## Gender Discount_offered Weight_in_gms Reached.on.Time_Y.N
## Length:10999 Min. : 1.00 Min. :1001 Min. :0.0000
## Class :character 1st Qu.: 4.00 1st Qu.:1840 1st Qu.:0.0000
## Mode :character Median : 7.00 Median :4149 Median :1.0000
## Mean :13.37 Mean :3634 Mean :0.5967
## 3rd Qu.:10.00 3rd Qu.:5050 3rd Qu.:1.0000
## Max. :65.00 Max. :7846 Max. :1.0000
class(shipping)
## [1] "data.frame"
sapply(shipping, class)
## ID Warehouse_block Mode_of_Shipment Customer_care_calls
## "integer" "character" "character" "integer"
## Customer_rating Cost_of_the_Product Prior_purchases Product_importance
## "integer" "integer" "integer" "character"
## Gender Discount_offered Weight_in_gms Reached.on.Time_Y.N
## "character" "integer" "integer" "integer"
factor_cols <- c("Warehouse_block","Mode_of_Shipment","Product_importance","Gender","Reached.on.Time_Y.N")
shipping[factor_cols] <- lapply(shipping[factor_cols], as.factor)
sapply(shipping,class)
## ID Warehouse_block Mode_of_Shipment Customer_care_calls
## "integer" "factor" "factor" "integer"
## Customer_rating Cost_of_the_Product Prior_purchases Product_importance
## "integer" "integer" "integer" "factor"
## Gender Discount_offered Weight_in_gms Reached.on.Time_Y.N
## "factor" "integer" "integer" "factor"
shipping %>%
group_by(Warehouse_block) %>%
filter(Reached.on.Time_Y.N == 1) %>%
summarise(total_delay = n()) %>%
ggplot(aes(x= Warehouse_block, y = total_delay)) +
geom_bar(stat = 'identity')
##### 2. Which Shipment mode contribute to delay ??
shipping %>%
group_by(Mode_of_Shipment) %>%
filter(Reached.on.Time_Y.N == 1) %>%
summarise(total_delay = n()) %>%
ggplot(aes(x= Mode_of_Shipment, y = total_delay)) +
geom_bar(stat = 'identity')
##### 3. Which product importance contribute to delay ??
shipping %>%
group_by(Product_importance) %>%
filter(Reached.on.Time_Y.N == 1) %>%
summarise(total_delay = n()) %>%
ggplot(aes(x= Product_importance, y = total_delay)) +
geom_bar(stat = 'identity')
##### 4. Weight distribution contribution to delay and on time #####
Findings: lighter weight contribute to delay whereas more heavier less
delay
p1 <- ggplot(data=shipping, aes(x=Weight_in_gms, group=Reached.on.Time_Y.N, fill=Reached.on.Time_Y.N)) +
geom_density(adjust=1.5, alpha=.4) +
theme_ipsum()
p1
##### 5. Cost distribution contribution to delay
p2 <- shipping %>%
filter(Reached.on.Time_Y.N == 1) %>%
ggplot(aes(x=Cost_of_the_Product)) +
geom_density(adjust = 1.5)
p2
##### 6. The more discount the more delay ?? ##### The more amount of
discount offered, the lesser the delay.
p3 <- shipping %>%
filter(Reached.on.Time_Y.N == 1) %>%
ggplot(aes(x=Discount_offered)) +
geom_density(adjust = 1.5)
p3
##### Check target variable whether need to
shipping %>%
group_by(Reached.on.Time_Y.N) %>%
summarise(per = n()/nrow(shipping)) %>%
ggplot(aes(x=Reached.on.Time_Y.N, y = per, fill = Reached.on.Time_Y.N)) +
geom_bar(stat = 'identity') +
geom_text(aes(label = round(per,2)), vjust =2)
##### Split Train and test set
set.seed(123)
split <- sample.split(shipping$Reached.on.Time_Y.N, SplitRatio = 0.7)
trainset <- subset(shipping, split == TRUE)
testset <- subset(shipping, split == FALSE)
dim(trainset)
## [1] 7699 12
dim(testset)
## [1] 3300 12
model <- glm(Reached.on.Time_Y.N~ .-ID , data = trainset, family = binomial)
summary(model)
##
## Call:
## glm(formula = Reached.on.Time_Y.N ~ . - ID, family = binomial,
## data = trainset)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.7117 -1.0801 0.1242 1.0882 1.8396
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.528e+00 2.553e-01 5.982 2.20e-09 ***
## Warehouse_blockB -3.235e-02 9.031e-02 -0.358 0.720226
## Warehouse_blockC -2.958e-02 8.987e-02 -0.329 0.742078
## Warehouse_blockD 2.412e-02 8.941e-02 0.270 0.787327
## Warehouse_blockF 1.099e-02 7.732e-02 0.142 0.886944
## Mode_of_ShipmentRoad -4.382e-02 9.202e-02 -0.476 0.633899
## Mode_of_ShipmentShip -4.140e-02 7.184e-02 -0.576 0.564418
## Customer_care_calls -1.023e-01 2.572e-02 -3.976 6.99e-05 ***
## Customer_rating 2.306e-02 1.851e-02 1.246 0.212796
## Cost_of_the_Product -2.033e-03 5.965e-04 -3.408 0.000655 ***
## Prior_purchases -4.763e-02 1.783e-02 -2.671 0.007552 **
## Product_importancelow -3.547e-01 1.002e-01 -3.542 0.000397 ***
## Product_importancemedium -3.830e-01 1.004e-01 -3.816 0.000136 ***
## GenderM 6.587e-02 5.216e-02 1.263 0.206623
## Discount_offered 1.142e-01 5.405e-03 21.120 < 2e-16 ***
## Weight_in_gms -2.237e-04 1.923e-05 -11.633 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 10383.3 on 7698 degrees of freedom
## Residual deviance: 8413.8 on 7683 degrees of freedom
## AIC: 8445.8
##
## Number of Fisher Scoring iterations: 6
vif(model)
## GVIF Df GVIF^(1/(2*Df))
## Warehouse_block 1.005738 4 1.000715
## Mode_of_Shipment 1.002491 2 1.000622
## Customer_care_calls 1.308477 1 1.143887
## Customer_rating 1.001042 1 1.000521
## Cost_of_the_Product 1.204841 1 1.097652
## Prior_purchases 1.087398 1 1.042784
## Product_importance 1.027218 2 1.006736
## Gender 1.002719 1 1.001359
## Discount_offered 1.031445 1 1.015601
## Weight_in_gms 1.402464 1 1.184257
trainpredict = predict(model, newdata = trainset, type = 'response')
p_class = ifelse(trainpredict>0.5, 1, 0)
matrix_table = table(trainset$Reached.on.Time_Y.N, p_class)
matrix_table
## p_class
## 0 1
## 0 1801 1304
## 1 1445 3149
accuracy = sum(diag(matrix_table))/sum(matrix_table)
round(accuracy,3)
## [1] 0.643
varImp(model)
## Overall
## Warehouse_blockB 0.3581562
## Warehouse_blockC 0.3291027
## Warehouse_blockD 0.2697834
## Warehouse_blockF 0.1421722
## Mode_of_ShipmentRoad 0.4762467
## Mode_of_ShipmentShip 0.5762910
## Customer_care_calls 3.9764640
## Customer_rating 1.2459149
## Cost_of_the_Product 3.4075511
## Prior_purchases 2.6714843
## Product_importancelow 3.5417752
## Product_importancemedium 3.8158895
## GenderM 1.2629057
## Discount_offered 21.1204080
## Weight_in_gms 11.6330001
testPredict = predict(model, newdata = testset, type = 'response')
p_class = ifelse(testPredict>0.5, 1, 0)
matrix_table_test = table(testset$Reached.on.Time_Y.N, p_class)
matrix_table_test
## p_class
## 0 1
## 0 747 584
## 1 627 1342
accuracy = sum(diag(matrix_table_test))/sum(matrix_table_test)
round(accuracy, 3)
## [1] 0.633
pred = prediction(trainpredict, trainset$Reached.on.Time_Y.N)
perf = performance(pred, "lift", "rpp")
plot(perf, main = "lift curve", xlab = 'Proportion of Customers (sorted prob)')
gain = performance(pred, "tpr", "rpp")
plot(gain, col="orange", lwd = 2)
##### Try another model by removing some variables
model1 <- glm(Reached.on.Time_Y.N~ .-ID -Customer_rating -Gender -Mode_of_Shipment -Warehouse_block, data = trainset, family = binomial)
summary(model1)
##
## Call:
## glm(formula = Reached.on.Time_Y.N ~ . - ID - Customer_rating -
## Gender - Mode_of_Shipment - Warehouse_block, family = binomial,
## data = trainset)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6867 -1.0799 0.1245 1.0883 1.8077
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.591e+00 2.354e-01 6.757 1.41e-11 ***
## Customer_care_calls -1.011e-01 2.569e-02 -3.938 8.23e-05 ***
## Cost_of_the_Product -2.031e-03 5.954e-04 -3.410 0.000649 ***
## Prior_purchases -4.811e-02 1.781e-02 -2.701 0.006914 **
## Product_importancelow -3.574e-01 1.000e-01 -3.572 0.000354 ***
## Product_importancemedium -3.876e-01 1.003e-01 -3.867 0.000110 ***
## Discount_offered 1.142e-01 5.404e-03 21.131 < 2e-16 ***
## Weight_in_gms -2.235e-04 1.921e-05 -11.637 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 10383 on 7698 degrees of freedom
## Residual deviance: 8418 on 7691 degrees of freedom
## AIC: 8434
##
## Number of Fisher Scoring iterations: 6
vif(model1)
## GVIF Df GVIF^(1/(2*Df))
## Customer_care_calls 1.305398 1 1.142540
## Cost_of_the_Product 1.201254 1 1.096018
## Prior_purchases 1.086402 1 1.042306
## Product_importance 1.024668 2 1.006111
## Discount_offered 1.031418 1 1.015587
## Weight_in_gms 1.400066 1 1.183244
varImp(model1)
## Overall
## Customer_care_calls 3.937548
## Cost_of_the_Product 3.410308
## Prior_purchases 2.700958
## Product_importancelow 3.572404
## Product_importancemedium 3.866700
## Discount_offered 21.130622
## Weight_in_gms 11.636738
trainpredict = predict(model1, newdata = trainset, type = 'response')
p_class = ifelse(trainpredict>0.5, 1, 0)
matrix_table = table(trainset$Reached.on.Time_Y.N, p_class)
matrix_table
## p_class
## 0 1
## 0 1793 1312
## 1 1466 3128
accuracy = sum(diag(matrix_table))/sum(matrix_table)
round(accuracy,3)
## [1] 0.639
testPredict = predict(model1, newdata = testset, type = 'response')
p_class = ifelse(testPredict>0.5, 1, 0)
matrix_table_test = table(testset$Reached.on.Time_Y.N, p_class)
matrix_table_test
## p_class
## 0 1
## 0 751 580
## 1 633 1336
accuracy = sum(diag(matrix_table_test))/sum(matrix_table_test)
round(accuracy, 3)
## [1] 0.632
library(randomForest)
rf_model <- randomForest(Reached.on.Time_Y.N~., data=trainset, proximity=TRUE)
print(rf_model)
##
## Call:
## randomForest(formula = Reached.on.Time_Y.N ~ ., data = trainset, proximity = TRUE)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 3
##
## OOB estimate of error rate: 33.58%
## Confusion matrix:
## 0 1 class.error
## 0 2325 780 0.2512077
## 1 1805 2789 0.3929038
p1 <- predict(rf_model, testset)
confusionMatrix(p1, testset$Reached.on.Time_Y.N)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 984 788
## 1 347 1181
##
## Accuracy : 0.6561
## 95% CI : (0.6396, 0.6723)
## No Information Rate : 0.5967
## P-Value [Acc > NIR] : 1.27e-12
##
## Kappa : 0.3218
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.7393
## Specificity : 0.5998
## Pos Pred Value : 0.5553
## Neg Pred Value : 0.7729
## Prevalence : 0.4033
## Detection Rate : 0.2982
## Detection Prevalence : 0.5370
## Balanced Accuracy : 0.6695
##
## 'Positive' Class : 0
##
hyper_grid <- expand.grid(
mtry = seq(2, 9, by = 1),
node_size = seq(3,9, by = 2),
OOB_RMSE = 0)
library(ranger)
for(i in 1:nrow(hyper_grid)) {
rf_tune <- ranger(
formula = Reached.on.Time_Y.N~ .,
data = trainset,
num.trees = 200,
mtry = hyper_grid$mtry[i],
min.node.size = hyper_grid$node_size[i],
seed = 123
)
hyper_grid$OOB_RMSE <- sqrt(rf_tune$prediction.error)
}
hyper_grid %>%
dplyr::arrange(OOB_RMSE) %>%
head(10)
## mtry node_size OOB_RMSE
## 1 2 3 0.5772003
## 2 3 3 0.5772003
## 3 4 3 0.5772003
## 4 5 3 0.5772003
## 5 6 3 0.5772003
## 6 7 3 0.5772003
## 7 8 3 0.5772003
## 8 9 3 0.5772003
## 9 2 5 0.5772003
## 10 3 5 0.5772003