Background

In this analysis, we will clasifying ‘channel’ from wholesale data. therefore, will used two models and comparing using Logistic regression model and KNN model. From the result of two models, will be chossen for the best model. Indicator of the best model will be compared the numbers of precision from them.

Import library

library(gtools)
library(gmodels)
library(ggplot2)
library(class)
library(grid)
library(dplyr)
library(caret)
library(GGally)

Import data and see summary of the data

wholesale <- read.csv("data_input/wholesale.csv")
glimpse(wholesale)
## Rows: 440
## Columns: 8
## $ Channel          <int> 2, 2, 2, 1, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 1, 2, 1…
## $ Region           <int> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3…
## $ Fresh            <int> 12669, 7057, 6353, 13265, 22615, 9413, 12126, 7579, …
## $ Milk             <int> 9656, 9810, 8808, 1196, 5410, 8259, 3199, 4956, 3648…
## $ Grocery          <int> 7561, 9568, 7684, 4221, 7198, 5126, 6975, 9426, 6192…
## $ Frozen           <int> 214, 1762, 2405, 6404, 3915, 666, 480, 1669, 425, 11…
## $ Detergents_Paper <int> 2674, 3293, 3516, 507, 1777, 1795, 3140, 3321, 1716,…
## $ Delicassen       <int> 1338, 1776, 7844, 1788, 5185, 1451, 545, 2566, 750, …

Data Pre-processing

  • Converting Channel data into categorical type, and remove region data. Because Region will not be used as predictor.
wholesale <- wholesale %>% 
  mutate(Channel = as.factor(Channel)) %>% 
  select(-Region)
  • Check missing data
anyNA(wholesale)
## [1] FALSE

Exploratory Data Analysis

  • Breakdown the predictor data
library(purrr)
wholesale %>%
    select_if(is.numeric) %>%
    map_dbl(sum)
##            Fresh             Milk          Grocery           Frozen 
##          5280131          2550357          3498562          1351650 
## Detergents_Paper       Delicassen 
##          1267857           670943
  • Propotion of data Channel
wholesale %>% 
  group_by(Channel) %>% 
  summarise(total=n()) %>% 
  ungroup() %>% 
  mutate(proportion = total/sum(total)*100) 

The result, Channel 1 has a highest proportion from total order with 67.73%

ggcorr(wholesale, label = T)
## Warning in ggcorr(wholesale, label = T): data in column(s) 'Channel' are not
## numeric and were ignored

  • Split data set into train and test dataset
samplesize <- 0.8*nrow(wholesale)
index <- sample(seq_len(nrow(wholesale)),size = samplesize)

data_train <- wholesale[index,]
data_test <- wholesale[-index,]

Logistic Regression

Fitting Model

model_all <- glm(formula = Channel ~ . , data = data_train, family = "binomial")
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(model_all)
## 
## Call:
## glm(formula = Channel ~ ., family = "binomial", data = data_train)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -3.01024  -0.26780  -0.18011   0.02009   2.73672  
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      -4.545e+00  6.297e-01  -7.218 5.28e-13 ***
## Fresh             2.844e-05  2.273e-05   1.252   0.2107    
## Milk              1.258e-04  6.055e-05   2.078   0.0377 *  
## Grocery           1.135e-04  7.731e-05   1.467   0.1423    
## Frozen           -1.321e-04  9.374e-05  -1.410   0.1587    
## Detergents_Paper  1.030e-03  1.753e-04   5.877 4.17e-09 ***
## Delicassen       -1.773e-04  1.246e-04  -1.423   0.1546    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 435.65  on 351  degrees of freedom
## Residual deviance: 141.41  on 345  degrees of freedom
## AIC: 155.41
## 
## Number of Fisher Scoring iterations: 7

Predicting using logistic regression

pred_logit <- predict(model_all,newdata = data_test,type = "response")

rmarkdown::paged_table(head(as.data.frame(pred_logit),10))

determine the class based on the threshold 0.5

pred_class <- as.factor(if_else(pred_logit > 0.5, "1", "2"))

Evaluation Logistic regression model

# confusion matrix
eval_lr <- confusionMatrix(data = pred_class, reference = data_test$Channel, 
    positive = "1")
eval_lr
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  1  2
##          1  5 27
##          2 50  6
##                                           
##                Accuracy : 0.125           
##                  95% CI : (0.0641, 0.2127)
##     No Information Rate : 0.625           
##     P-Value [Acc > NIR] : 1.00000         
##                                           
##                   Kappa : -0.6383         
##                                           
##  Mcnemar's Test P-Value : 0.01217         
##                                           
##             Sensitivity : 0.09091         
##             Specificity : 0.18182         
##          Pos Pred Value : 0.15625         
##          Neg Pred Value : 0.10714         
##              Prevalence : 0.62500         
##          Detection Rate : 0.05682         
##    Detection Prevalence : 0.36364         
##       Balanced Accuracy : 0.13636         
##                                           
##        'Positive' Class : 1               
## 

KNN model

Cross validation

RNGkind(sample.kind = "Rounding")
## Warning in RNGkind(sample.kind = "Rounding"): non-uniform 'Rounding' sampler
## used
set.seed(100)
intrain <- sample(x = nrow(wholesale), size = 0.8*nrow(wholesale)) 
train_wholesale <- wholesale[intrain,]
test_wholesale <- wholesale[-intrain,]
prop.table(table(train_wholesale$Channel))
## 
##         1         2 
## 0.6846591 0.3153409

Scaling

# memisahkan prediktor (x) dan target (y) dengan menggunakan object yang berbeda
# x data train
train_x <- train_wholesale %>% 
   select_if(is.numeric)

# y data train
train_y <-  train_wholesale %>% 
   select(Channel)

# x data test
test_x <- test_wholesale %>% 
   select_if(is.numeric)

# y data test
test_y <- test_wholesale %>% 
   select(Channel)
# scaling x data train
train_x <- scale(train_x)

# scaling x data test
test_x <- scale(test_x, center = attr(train_x,"scaled:center"), 
                scale = attr(train_x,"scaled:center"))

Determine K value

sqrt(nrow(train_wholesale))
## [1] 18.76166
library(class)
pred_wholesale <- knn(train = train_x, test = test_x, cl = train_y$Channel, k = 19)
head(pred_wholesale)
## [1] 2 1 2 1 1 1
## Levels: 1 2

Evaluation model

library(caret)
eval_knn <- confusionMatrix(data = pred_wholesale, reference = test_y$Channel, positive = "1")
eval_knn
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  1  2
##          1 55  6
##          2  2 25
##                                           
##                Accuracy : 0.9091          
##                  95% CI : (0.8287, 0.9599)
##     No Information Rate : 0.6477          
##     P-Value [Acc > NIR] : 1.509e-08       
##                                           
##                   Kappa : 0.7948          
##                                           
##  Mcnemar's Test P-Value : 0.2888          
##                                           
##             Sensitivity : 0.9649          
##             Specificity : 0.8065          
##          Pos Pred Value : 0.9016          
##          Neg Pred Value : 0.9259          
##              Prevalence : 0.6477          
##          Detection Rate : 0.6250          
##    Detection Prevalence : 0.6932          
##       Balanced Accuracy : 0.8857          
##                                           
##        'Positive' Class : 1               
## 

Comparison model logistic regression & KNN

eval_lr <- data_frame(Accuracy = eval_lr$overall[1],
           Recall = eval_lr$byClass[1],
           Specificity = eval_lr$byClass[2],
           Precision = eval_lr$byClass[3])
## Warning: `data_frame()` is deprecated as of tibble 1.1.0.
## Please use `tibble()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
eval_knn <- data_frame(Accuracy = eval_knn$overall[1],
           Recall = eval_knn$byClass[1],
           Specificity = eval_knn$byClass[2],
           Precision = eval_knn$byClass[3])
#evaluate logistic regression Model
eval_lr
# evaluate KNN Model
eval_knn

Conclusion

If we compare from two model in above, the result of high precision number is 90.16%. and this is more higher than Logistic regression.