The dataset refers to clients/customers of a wholesale distributor. It includes the annual spending in monetary units on diverse product categories (Data source: UCI Machine Learning https://archive.ics.uci.edu/ml/datasets/Wholesale+customers). In this case, we would like to see the prediction of channel ‘Horeca’ is correctly predicted.
library(dplyr)
library(caret)
library(readr)
library(class)
library(gtools)
library(tidyverse)
library(inspectdf)
library(MASS)
library(plotly)wsLBB <- read.csv("data_input/wholesale.csv")
wsLBBData description:
FRESH: annual spending on fresh products.
MILK: annual spending on milk products.
GROCERY: annual spending on grocery products.
FROZEN: annual spending on frozen products.
DETERGENTS_PAPER: annual spending on detergents and paper products.
DELICATESSEN: annual spending on and delicatessen products.
CHANNEL: customers Channel - Horeca (Hotel/Restaurant/Cafe) or Retail channel.
colSums(is.na(wsLBB))## Channel Region Fresh Milk
## 0 0 0 0
## Grocery Frozen Detergents_Paper Delicassen
## 0 0 0 0
wsLBB <- wsLBB %>%
select(-Region) %>%
mutate(Channel = as.factor(Channel))
wsLBBnum <- inspect_num(wsLBB)
cat <- inspect_cat(wsLBB)
show_plot(num); show_plot(cat)prop.table(table(wsLBB$Channel))##
## 1 2
## 0.6772727 0.3227273
set.seed(200)
index <- sample(nrow(wsLBB),0.8*nrow(wsLBB))
train_wsLBB <- wsLBB[index,]
test_wsLBB <- wsLBB[-index,]train_wsLBB.new <- upSample(x = train_wsLBB[, -1],
y = train_wsLBB$Channel,
yname = "Channel")prop.table(table(train_wsLBB.new$Channel))##
## 1 2
## 0.5 0.5
wsLBB_model <- glm(Channel~.,train_wsLBB.new, family='binomial')## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
step(wsLBB_model, method="backward")## Start: AIC=264.49
## Channel ~ Fresh + Milk + Grocery + Frozen + Detergents_Paper +
## Delicassen
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Df Deviance AIC
## - Milk 1 251.05 263.05
## - Fresh 1 251.25 263.25
## - Delicassen 1 251.42 263.42
## <none> 250.49 264.49
## - Frozen 1 256.41 268.41
## - Grocery 1 256.52 268.52
## - Detergents_Paper 1 309.65 321.65
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
##
## Step: AIC=263.05
## Channel ~ Fresh + Grocery + Frozen + Detergents_Paper + Delicassen
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Df Deviance AIC
## - Delicassen 1 251.64 261.64
## - Fresh 1 251.96 261.96
## <none> 251.05 263.05
## - Grocery 1 256.64 266.64
## - Frozen 1 257.52 267.52
## - Detergents_Paper 1 309.89 319.89
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
##
## Step: AIC=261.64
## Channel ~ Fresh + Grocery + Frozen + Detergents_Paper
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Df Deviance AIC
## - Fresh 1 252.88 260.88
## <none> 251.64 261.64
## - Frozen 1 257.54 265.54
## - Grocery 1 258.67 266.67
## - Detergents_Paper 1 310.18 318.18
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
##
## Step: AIC=260.88
## Channel ~ Grocery + Frozen + Detergents_Paper
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Df Deviance AIC
## <none> 252.88 260.88
## - Frozen 1 257.76 263.76
## - Grocery 1 261.06 267.06
## - Detergents_Paper 1 310.28 316.28
##
## Call: glm(formula = Channel ~ Grocery + Frozen + Detergents_Paper,
## family = "binomial", data = train_wsLBB.new)
##
## Coefficients:
## (Intercept) Grocery Frozen Detergents_Paper
## -3.0537035 0.0001570 -0.0001134 0.0008759
##
## Degrees of Freedom: 483 Total (i.e. Null); 480 Residual
## Null Deviance: 671
## Residual Deviance: 252.9 AIC: 260.9
wsLBB_model <- glm(Channel ~ Grocery + Frozen + Detergents_Paper,
family = "binomial", data = train_wsLBB.new)## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
test_wsLBB$prob <- predict(wsLBB_model, test_wsLBB, type = "response")
head(test_wsLBB)hist(test_wsLBB$prob)test_wsLBB$predict <- as.factor(ifelse(test_wsLBB$prob >= 0.5, 2, 1))
head(test_wsLBB)test_wsLBB$predict2 <- as.factor(ifelse(test_wsLBB$prob >= 0.4, 2, 1))
head(test_wsLBB)** positive class = 1 **
confusionMatrix(test_wsLBB$predict, test_wsLBB$Channel, positive ='1')## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2
## 1 50 5
## 2 6 27
##
## Accuracy : 0.875
## 95% CI : (0.7873, 0.9359)
## No Information Rate : 0.6364
## P-Value [Acc > NIR] : 4.749e-07
##
## Kappa : 0.7317
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.8929
## Specificity : 0.8438
## Pos Pred Value : 0.9091
## Neg Pred Value : 0.8182
## Prevalence : 0.6364
## Detection Rate : 0.5682
## Detection Prevalence : 0.6250
## Balanced Accuracy : 0.8683
##
## 'Positive' Class : 1
##
confusionMatrix(test_wsLBB$predict2, test_wsLBB$Channel, positive ='1')## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2
## 1 49 3
## 2 7 29
##
## Accuracy : 0.8864
## 95% CI : (0.8009, 0.9441)
## No Information Rate : 0.6364
## P-Value [Acc > NIR] : 1.136e-07
##
## Kappa : 0.7609
##
## Mcnemar's Test P-Value : 0.3428
##
## Sensitivity : 0.8750
## Specificity : 0.9062
## Pos Pred Value : 0.9423
## Neg Pred Value : 0.8056
## Prevalence : 0.6364
## Detection Rate : 0.5568
## Detection Prevalence : 0.5909
## Balanced Accuracy : 0.8906
##
## 'Positive' Class : 1
##
Note: After changing the threshold from 0.5 to 0.4, the precision of the model is raise from 90.91 & to 94.23%
wsKNN <- read.csv("data_input/wholesale.csv")
wsKNNsapply(wsKNN, function(x) {sum(is.na(x))})## Channel Region Fresh Milk
## 0 0 0 0
## Grocery Frozen Detergents_Paper Delicassen
## 0 0 0 0
wsKNN <- wsKNN %>%
select(-Region) %>%
mutate(Channel = factor(Channel,
levels=c(1,2),
labels=c("horeca","retail")))
wsKNNwsKNN.sc <- wsLBB %>%
mutate_if(is.integer, scale)range(wsKNN[,2:7]) # before scaling## [1] 3 112151
range(wsKNN.sc[,2:7]) # after scaling## [1] -0.9486033 16.4597113
set.seed(200)
index <- sample(nrow(wsKNN.sc),0.8*nrow(wsKNN.sc))
train_wsKNN.sc <- wsKNN.sc[index,]
test_wsKNN.sc <- wsKNN.sc[-index,]sqrt(nrow(train_wsKNN.sc)) ## [1] 18.76166
–> from sqrt(), the value of k = 18.76
wsKNN_pred <- knn(train=train_wsKNN.sc[,-1],
test=test_wsKNN.sc[,-1],
cl=train_wsKNN.sc$Channel,
19)wsKNN_pred2 <- knn(train=train_wsKNN.sc[,-1],
test=test_wsKNN.sc[,-1],
cl=train_wsKNN.sc$Channel,
17)Positive class in this case is 1 = horeca
k=19
confusionMatrix(wsKNN_pred,
test_wsKNN.sc$Channel, positive = '1')## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2
## 1 53 4
## 2 3 28
##
## Accuracy : 0.9205
## 95% CI : (0.843, 0.9674)
## No Information Rate : 0.6364
## P-Value [Acc > NIR] : 7.867e-10
##
## Kappa : 0.827
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9464
## Specificity : 0.8750
## Pos Pred Value : 0.9298
## Neg Pred Value : 0.9032
## Prevalence : 0.6364
## Detection Rate : 0.6023
## Detection Prevalence : 0.6477
## Balanced Accuracy : 0.9107
##
## 'Positive' Class : 1
##
confusionMatrix(wsKNN_pred2,
test_wsKNN.sc$Channel, positive = '1')## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2
## 1 53 4
## 2 3 28
##
## Accuracy : 0.9205
## 95% CI : (0.843, 0.9674)
## No Information Rate : 0.6364
## P-Value [Acc > NIR] : 7.867e-10
##
## Kappa : 0.827
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9464
## Specificity : 0.8750
## Pos Pred Value : 0.9298
## Neg Pred Value : 0.9032
## Prevalence : 0.6364
## Detection Rate : 0.6023
## Detection Prevalence : 0.6477
## Balanced Accuracy : 0.9107
##
## 'Positive' Class : 1
##
Our target of the analysis is to see the prediction of customers channel ‘Horeca’ is correctly predicted, in order to make sure that we do the right strategy to the correct channel to increase horeca channel in the population (i.e by giving special price to the variables/items to horeca channel). For this case, we will see the precision value from the analysis result.
Based on the regression method above, precision result using logistic regression with threshold = 0.4 is higher than KNN algorithm.
The precision value results: