First, let’s set up caching for this notebook and load the pacman package so we can load libraries conveniently.

knitr::opts_chunk$set(cache = TRUE)
options(scipen = 9999)

if (!require("pacman")) install.packages("pacman")
## Loading required package: pacman

Then, load the data.

p_load(dplyr)
wholesale <- read.csv("data_input/wholesale.csv") %>%
  select(-2) # remove region feature

str(wholesale)
## 'data.frame':    440 obs. of  7 variables:
##  $ Channel         : int  2 2 2 1 2 2 2 2 1 2 ...
##  $ Fresh           : int  12669 7057 6353 13265 22615 9413 12126 7579 5963 6006 ...
##  $ Milk            : int  9656 9810 8808 1196 5410 8259 3199 4956 3648 11093 ...
##  $ Grocery         : int  7561 9568 7684 4221 7198 5126 6975 9426 6192 18881 ...
##  $ Frozen          : int  214 1762 2405 6404 3915 666 480 1669 425 1159 ...
##  $ Detergents_Paper: int  2674 3293 3516 507 1777 1795 3140 3321 1716 7425 ...
##  $ Delicassen      : int  1338 1776 7844 1788 5185 1451 545 2566 750 2098 ...

First, we proceed with the KNN method. Let’s convert the Channel feature into Industry and make it a factor.

wholesale.k <- wholesale %>%
  mutate(Industry = factor(Channel, levels = c(1, 2), labels = c("horeca", "retail"))) %>%
  select(-1) # remove the original Channel feature

wholesale.k %>% head()
##   Fresh Milk Grocery Frozen Detergents_Paper Delicassen Industry
## 1 12669 9656    7561    214             2674       1338   retail
## 2  7057 9810    9568   1762             3293       1776   retail
## 3  6353 8808    7684   2405             3516       7844   retail
## 4 13265 1196    4221   6404              507       1788   horeca
## 5 22615 5410    7198   3915             1777       5185   retail
## 6  9413 8259    5126    666             1795       1451   retail

Check the proportion of the Industry feature.

round(prop.table(table(wholesale.k$Industry)) * 100, 2)
## 
## horeca retail 
##  67.73  32.27

Normalize the data using z-score normalization.

wholesale.z <- as.data.frame(scale(wholesale.k[, -7])) # scale the numeric features, excluding Industry
summary(wholesale.z)
##      Fresh              Milk            Grocery            Frozen        
##  Min.   :-0.9486   Min.   :-0.7779   Min.   :-0.8364   Min.   :-0.62763  
##  1st Qu.:-0.7015   1st Qu.:-0.5776   1st Qu.:-0.6101   1st Qu.:-0.47988  
##  Median :-0.2764   Median :-0.2939   Median :-0.3363   Median :-0.31844  
##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.00000  
##  3rd Qu.: 0.3901   3rd Qu.: 0.1889   3rd Qu.: 0.2846   3rd Qu.: 0.09935  
##  Max.   : 7.9187   Max.   : 9.1732   Max.   : 8.9264   Max.   :11.90545  
##  Detergents_Paper    Delicassen     
##  Min.   :-0.6037   Min.   :-0.5396  
##  1st Qu.:-0.5505   1st Qu.:-0.3960  
##  Median :-0.4331   Median :-0.1984  
##  Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 0.2182   3rd Qu.: 0.1047  
##  Max.   : 7.9586   Max.   :16.4597
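
As a quick sanity check, the z-score for a single column can be computed by hand, the value minus the column mean divided by its standard deviation, and compared against scale()’s output:

# manual z-score for Fresh; should match the scale() result exactly
z_fresh <- (wholesale.k$Fresh - mean(wholesale.k$Fresh)) / sd(wholesale.k$Fresh)
all.equal(z_fresh, wholesale.z$Fresh)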

Recombine the scaled features with the Industry labels.

wholesale.n <- as.data.frame(cbind(wholesale.z, "Industry" = wholesale.k$Industry))
summary(wholesale.n)
##      Fresh              Milk            Grocery            Frozen        
##  Min.   :-0.9486   Min.   :-0.7779   Min.   :-0.8364   Min.   :-0.62763  
##  1st Qu.:-0.7015   1st Qu.:-0.5776   1st Qu.:-0.6101   1st Qu.:-0.47988  
##  Median :-0.2764   Median :-0.2939   Median :-0.3363   Median :-0.31844  
##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.00000  
##  3rd Qu.: 0.3901   3rd Qu.: 0.1889   3rd Qu.: 0.2846   3rd Qu.: 0.09935  
##  Max.   : 7.9187   Max.   : 9.1732   Max.   : 8.9264   Max.   :11.90545  
##  Detergents_Paper    Delicassen        Industry  
##  Min.   :-0.6037   Min.   :-0.5396   horeca:298  
##  1st Qu.:-0.5505   1st Qu.:-0.3960   retail:142  
##  Median :-0.4331   Median :-0.1984               
##  Mean   : 0.0000   Mean   : 0.0000               
##  3rd Qu.: 0.2182   3rd Qu.: 0.1047               
##  Max.   : 7.9586   Max.   :16.4597

Split the data into training and test sets.

set.seed(2)

index_sample <- sample(nrow(wholesale.n))
wholesale_train <- wholesale.n[index_sample, ]
wholesale_test <- wholesale.n[index_sample, ]

prop.table(table(wholesale_train$Industry))
## 
##    horeca    retail 
## 0.6772727 0.3227273
prop.table(table(wholesale_test$Industry))
## 
##    horeca    retail 
## 0.6772727 0.3227273
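
Note that indexing both subsets with the same shuffled index_sample makes wholesale_train and wholesale_test identical copies of the full data, which is why the two proportion tables above match exactly. A more conventional hold-out split, shown here only as a sketch (the results below are computed with the split above), would reserve part of the rows for testing:

# hypothetical 80/20 hold-out split (not used for the results that follow)
index_80 <- sample(nrow(wholesale.n), size = floor(0.8 * nrow(wholesale.n)))
train_80 <- wholesale.n[index_80, ]
test_80 <- wholesale.n[-index_80, ]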

Create an accuracy function.

accuracy <- function(data) {
  # initialize the count of correct predictions
  correct <- 0
  for (i in c(1:nrow(data))) {
    # 7th variable is actual class, 8th is our predicted class
    if (data[i, 7] == data[i, 8]) {
      correct <- correct + 1
    }
  }
  percentage <- correct / nrow(data) * 100
  return(percentage)
}
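
Because the comparison is element-wise, the loop can also be written as a vectorized equivalent (assuming the same column layout):

accuracy_vec <- function(data) {
  # the mean of a logical vector is the proportion of TRUEs
  mean(data[, 7] == data[, 8]) * 100
}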

Finally, create the KNN predictions and check the accuracy.

p_load(class)
predictions <- knn(
  train = wholesale_train[, 1:6], test = wholesale_test[, 1:6],
  cl = wholesale_train[, 7], k = 19
)

wholesale_test[, 8] <- predictions
print(accuracy(wholesale_test))
## [1] 90.90909
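
One common rule of thumb for picking k is the square root of the number of training observations, adjusted to an odd number so the two classes cannot tie in the vote; k = 19 is close to that value here:

# sqrt of the training-set size as a starting point for k
sqrt(nrow(wholesale_train)) # about 20.98, so the odd neighbours are 19 and 21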

Let’s try another method, logistic regression. We have to make sure there is no multicollinearity among the variables. Here I use the chart.Correlation() function from the PerformanceAnalytics package to see the correlations between variables.

p_load(PerformanceAnalytics)
wholesale.n[, 7] <- as.numeric(wholesale.n[, 7]) # convert Industry to numeric for the correlation chart
chart.Correlation(wholesale.n)
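
chart.Correlation() draws a scatterplot matrix annotated with correlation coefficients; if a plain numeric table is preferred, the same Pearson correlations can be inspected with cor() (a sketch, output omitted):

# pairwise Pearson correlations, rounded for readability
round(cor(wholesale.n), 2)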

It looks like Grocery is strongly correlated with Detergents_Paper and has a weaker association with Industry than Detergents_Paper does. So, let’s remove it and run stepwise regression with backward elimination to find the best predictors for the model.

lm.all <- glm(Industry ~ Fresh + Milk + Frozen + Detergents_Paper + Delicassen, data = wholesale_train, family = "binomial")
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
step(lm.all, direction = "backward")
## Start:  AIC=220.15
## Industry ~ Fresh + Milk + Frozen + Detergents_Paper + Delicassen
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
##                    Df Deviance    AIC
## - Delicassen        1   208.17 218.17
## - Fresh             1   208.40 218.40
## <none>                  208.15 220.15
## - Frozen            1   214.11 224.11
## - Milk              1   214.48 224.48
## - Detergents_Paper  1   348.67 358.67
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## 
## Step:  AIC=218.17
## Industry ~ Fresh + Milk + Frozen + Detergents_Paper
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
##                    Df Deviance    AIC
## - Fresh             1   208.40 216.40
## <none>                  208.17 218.17
## - Milk              1   214.69 222.69
## - Frozen            1   215.54 223.54
## - Detergents_Paper  1   348.74 356.74
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## 
## Step:  AIC=216.4
## Industry ~ Milk + Frozen + Detergents_Paper
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
##                    Df Deviance    AIC
## <none>                  208.40 216.40
## - Milk              1   215.38 221.38
## - Frozen            1   215.76 221.76
## - Detergents_Paper  1   351.18 357.18
## 
## Call:  glm(formula = Industry ~ Milk + Frozen + Detergents_Paper, family = "binomial", 
##     data = wholesale_train)
## 
## Coefficients:
##      (Intercept)              Milk            Frozen  Detergents_Paper  
##          -0.4646            0.9042           -0.8845            4.6254  
## 
## Degrees of Freedom: 439 Total (i.e. Null);  436 Residual
## Null Deviance:       553.4 
## Residual Deviance: 208.4     AIC: 216.4
wholesale.logit <- glm(
  formula = Industry ~ Milk + Frozen + Detergents_Paper,
  family = "binomial", data = wholesale_train
)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
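
Since the coefficients are on the log-odds scale, exponentiating them gives odds ratios, which are easier to interpret: because the predictors are standardized, the Detergents_Paper coefficient of about 4.63 means a one-standard-deviation increase multiplies the odds of retail by roughly exp(4.63) ≈ 102 (a sketch, output omitted):

# odds ratios from the log-odds coefficients
exp(coef(wholesale.logit))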

After that, let’s check the multicollinearity again using the vif() function from the car package.

p_load(car)
vif(wholesale.logit)
##             Milk           Frozen Detergents_Paper 
##         1.182963         1.135161         1.077362

The resulting VIF values are all close to 1, well below the usual cutoff of 5 or 10, so multicollinearity is not a concern. Now let’s predict on the test set.

wholesale_test$pred.Risk <- predict(wholesale.logit, wholesale_test, type = "response")

Create a similar accuracy function for the logistic regression predictions.

accuracy_log <- function(data) {
  # initialize number of predictions
  correct <- 0
  for (i in c(1:nrow(data))) {
    # 10th variable is our predicted class, 11th is the actual class
    if (data[i, 10] == data[i, 11]) {
      correct <- correct + 1
    }
  }
  percentage <- correct / nrow(data) * 100
  return(percentage)
}

And finally, we find the accuracy, classifying an observation as retail when its predicted probability is at least 0.3.

wholesale_test$pred_log <- as.numeric(wholesale_test$pred.Risk >= 0.3)
wholesale_test$actual <- as.numeric(wholesale_test$Industry == "retail")
accuracy_log(wholesale_test)
## [1] 91.36364
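
Accuracy alone hides which class the errors come from; a confusion matrix of predicted versus actual classes shows the breakdown (a sketch, output omitted):

# rows: predicted class (0 = horeca, 1 = retail); columns: actual class
table(predicted = wholesale_test$pred_log, actual = wholesale_test$actual)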

The accuracy of logistic regression (91.36%) is slightly better than the accuracy of KNN (90.91%). This likely follows from removing some variables before fitting the logistic regression; the KNN accuracy might well differ if we restricted it to the same subset of variables.

With KNN, we don’t need to verify assumptions such as the absence of multicollinearity among variables or linearity. In the logistic regression workflow, we need to inspect the correlations between variables first, and also use stepwise regression and VIF.

Compared to KNN, logistic regression doesn’t need the data to be scaled, and it also works when there are non-numeric variables.