1 Initialization

1.1 Library Call & Function Definition

library(dplyr)
library(class)
#' Min-max rescale a numeric vector to the [0, 1] interval.
#'
#' @param x A numeric vector.
#' @param na.rm If `TRUE`, ignore `NA`s when computing the range; the `NA`
#'   entries themselves stay `NA` in the result. Defaults to `FALSE`, in which
#'   case any `NA` in `x` makes the whole result `NA`.
#' @return A numeric vector the same length as `x`. A vector whose values are
#'   all equal is mapped to zeros instead of `NaN`.
normalize <- function(x, na.rm = FALSE) {
  rng <- range(x, na.rm = na.rm)
  span <- rng[2] - rng[1]
  # A constant vector has a span of exactly zero; dividing by it would turn
  # every entry into NaN (0 / 0), so return the centred values (all zeros).
  if (!is.na(span) && span == 0) {
    return(x - rng[1])
  }
  (x - rng[1]) / span
}

#' Standardise a numeric vector to mean 0 and standard deviation 1.
#'
#' @param x A numeric vector.
#' @param na.rm If `TRUE`, ignore `NA`s when computing the mean and standard
#'   deviation; the `NA` entries themselves stay `NA` in the result. Defaults
#'   to `FALSE`, in which case any `NA` in `x` makes the whole result `NA`.
#' @return A numeric vector the same length as `x`. A vector whose standard
#'   deviation is exactly zero is mapped to its centred values (zeros) instead
#'   of `NaN`.
zscore <- function(x, na.rm = FALSE) {
  centered <- x - mean(x, na.rm = na.rm)
  spread <- sd(x, na.rm = na.rm)
  # A constant vector has zero spread; dividing by it would give NaN (0 / 0).
  if (!is.na(spread) && spread == 0) {
    return(centered)
  }
  centered / spread
}

1.2 Read Data Source

Using the dataset provided by Algoritma Training, read the data and convert the target into a factor.

# Read the dataset and drop the "Region" feature because it is not of interest.
# Columns are removed by name rather than by position, so a CSV with a
# different column order cannot silently lose the wrong feature.
w <- read.csv("wholesale.csv", header = TRUE)
stopifnot(all(c("Channel", "Region") %in% names(w)))
w$Region <- NULL
# `w` gets a human-readable target (used by k-NN); `w.glm` gets the same target
# labelled 0/1 (0 = horeca, 1 = retail) for logistic regression.
w.glm <- w
w$Industry <- factor(w$Channel, levels = c(1, 2), labels = c("horeca", "retail"))
w.glm$Industry <- factor(w$Channel, levels = c(1, 2), labels = c(0, 1))
# Channel is now encoded in Industry, so the raw column is removed from both
# data frames; Industry stays the last column.
w$Channel <- NULL
w.glm$Channel <- NULL

1.3 Set Data Train & Data Test

We use 80% of the data as the training set and keep the remaining 20% as the test set.

# Fix the RNG seed so the 80/20 split below is reproducible.
set.seed(2902)
nwtrain <- sample.int(nrow(w), size = nrow(w) * 0.8)

# Class labels for k-NN: column 7 of `w` is the Industry factor.
w.train.labels <- w[[7]][nwtrain]
w.test.labels <- w[[7]][-nwtrain]

# Logistic-regression split (unscaled features plus the 0/1 target).
w.glm.test <- w.glm[-nwtrain, ]
w.glm.train <- w.glm[nwtrain, ]

# k-NN inputs: every column except the target is rescaled, once with min-max
# and once with z-scores, and each scaled copy is split with the same indices.
# NOTE(review): the scaling is computed over all rows before the split, so the
# test rows influence the min/max and mean/sd applied to the training rows.
w.mm <- as.data.frame(lapply(w[-7], normalize))
w.z <- as.data.frame(lapply(w[-7], zscore))

w.test.knn.mm <- w.mm[-nwtrain, ]
w.train.knn.mm <- w.mm[nwtrain, ]
w.test.knn.z <- w.z[-nwtrain, ]
w.train.knn.z <- w.z[nwtrain, ]

2 Logistic Regression Model

Create two logistic regression models: one with no predictors (intercept only) and one with all predictors from the dataset.

# Intercept-only model: the starting point for the forward stepwise search.
w.glm.none <- glm(Industry ~ 1, family = "binomial", data = w.glm.train)
# Full model: Industry regressed on every other column of w.glm.train.
w.glm.all <- glm(Industry ~ ., family = "binomial", data = w.glm.train)
# Report the intercept-only model (coefficient table, deviance and AIC).
summary(w.glm.none)
## 
## Call:
## glm(formula = Industry ~ 1, family = "binomial", data = w.glm.train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.8942  -0.8942  -0.8942   1.4900   1.4900  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -0.7102     0.1134  -6.264 3.76e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 446.23  on 351  degrees of freedom
## Residual deviance: 446.23  on 351  degrees of freedom
## AIC: 448.23
## 
## Number of Fisher Scoring iterations: 4
# Report the full model; its AIC and residual deviance are the baseline the
# stepwise searches try to improve on.
summary(w.glm.all)
## 
## Call:
## glm(formula = Industry ~ ., family = "binomial", data = w.glm.train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.6567  -0.3456  -0.2487   0.0802   3.2292  
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      -3.370e+00  4.278e-01  -7.879 3.30e-15 ***
## Fresh             3.959e-06  1.693e-05   0.234   0.8151    
## Milk              7.784e-05  5.779e-05   1.347   0.1780    
## Grocery           1.081e-04  6.445e-05   1.677   0.0935 .  
## Frozen           -2.099e-04  1.042e-04  -2.015   0.0439 *  
## Detergents_Paper  7.413e-04  1.392e-04   5.328 9.95e-08 ***
## Delicassen       -2.670e-05  1.093e-04  -0.244   0.8070    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 446.23  on 351  degrees of freedom
## Residual deviance: 176.49  on 345  degrees of freedom
## AIC: 190.49
## 
## Number of Fisher Scoring iterations: 7

2.1 Doing stepwise check

2.1.1 Backward Direction Step

# Backward elimination starting from the full model. step() refits with the
# data recorded in the model's own call, so no `data` argument is passed.
summary(step(w.glm.all, direction = "backward", trace = 0))
## 
## Call:
## glm(formula = Industry ~ Grocery + Frozen + Detergents_Paper, 
##     family = "binomial", data = w.glm.train)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.73325  -0.34956  -0.25964   0.07586   3.04267  
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      -3.330e+00  4.062e-01  -8.198 2.45e-16 ***
## Grocery           1.523e-04  5.471e-05   2.783  0.00538 ** 
## Frozen           -1.689e-04  7.954e-05  -2.123  0.03376 *  
## Detergents_Paper  7.380e-04  1.380e-04   5.347 8.93e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 446.23  on 351  degrees of freedom
## Residual deviance: 178.39  on 348  degrees of freedom
## AIC: 186.39
## 
## Number of Fisher Scoring iterations: 7

  

2.1.2 Forward Direction Step

# Forward selection starting from the intercept-only model. The upper bound of
# the search is given as the full model's formula, and step() refits with the
# data recorded in the model's own call, so no `data` argument is passed.
summary(step(
  w.glm.none,
  scope = list(upper = formula(w.glm.all)),
  direction = "forward",
  trace = 0
))
## 
## Call:
## glm(formula = Industry ~ Detergents_Paper + Grocery + Frozen, 
##     family = "binomial", data = w.glm.train)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.73325  -0.34956  -0.25964   0.07586   3.04267  
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      -3.330e+00  4.062e-01  -8.198 2.45e-16 ***
## Detergents_Paper  7.380e-04  1.380e-04   5.347 8.93e-08 ***
## Grocery           1.523e-04  5.471e-05   2.783  0.00538 ** 
## Frozen           -1.689e-04  7.954e-05  -2.123  0.03376 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 446.23  on 351  degrees of freedom
## Residual deviance: 178.39  on 348  degrees of freedom
## AIC: 186.39
## 
## Number of Fisher Scoring iterations: 7

  

2.1.3 Bi-Direction Step

2.1.3.1 From Upper Level

# Bidirectional search starting from the full model, which is also the upper
# bound of the search (given as a formula). step() refits with the data
# recorded in the model's own call, so no `data` argument is passed.
summary(step(
  w.glm.all,
  scope = list(upper = formula(w.glm.all)),
  direction = "both",
  trace = 0
))
## 
## Call:
## glm(formula = Industry ~ Grocery + Frozen + Detergents_Paper, 
##     family = "binomial", data = w.glm.train)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.73325  -0.34956  -0.25964   0.07586   3.04267  
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      -3.330e+00  4.062e-01  -8.198 2.45e-16 ***
## Grocery           1.523e-04  5.471e-05   2.783  0.00538 ** 
## Frozen           -1.689e-04  7.954e-05  -2.123  0.03376 *  
## Detergents_Paper  7.380e-04  1.380e-04   5.347 8.93e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 446.23  on 351  degrees of freedom
## Residual deviance: 178.39  on 348  degrees of freedom
## AIC: 186.39
## 
## Number of Fisher Scoring iterations: 7

  

2.1.3.2 From Lower Level

# Bidirectional search starting from the intercept-only model, with the full
# model's formula as the upper bound. step() refits with the data recorded in
# the model's own call, so no `data` argument is passed.
summary(step(
  w.glm.none,
  scope = list(upper = formula(w.glm.all)),
  direction = "both",
  trace = 0
))
## 
## Call:
## glm(formula = Industry ~ Detergents_Paper + Grocery + Frozen, 
##     family = "binomial", data = w.glm.train)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.73325  -0.34956  -0.25964   0.07586   3.04267  
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      -3.330e+00  4.062e-01  -8.198 2.45e-16 ***
## Detergents_Paper  7.380e-04  1.380e-04   5.347 8.93e-08 ***
## Grocery           1.523e-04  5.471e-05   2.783  0.00538 ** 
## Frozen           -1.689e-04  7.954e-05  -2.123  0.03376 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 446.23  on 351  degrees of freedom
## Residual deviance: 178.39  on 348  degrees of freedom
## AIC: 186.39
## 
## Number of Fisher Scoring iterations: 7

  

2.1.4 Summary

All four stepwise searches (backward, forward, and bidirectional from either end) arrive at the same model, which has the lowest AIC (186.39). It includes the following variables:

  • Detergents_Paper
  • Grocery
  • Frozen

Hence our logistic regression model call will be

# Refit the three-predictor model on the training data so it can be used for
# prediction on the test set.
w.glm.adj <- glm(
  Industry ~ Detergents_Paper + Grocery + Frozen,
  data = w.glm.train,
  family = "binomial"
)

2.2 Hold-out Validation - Confusion Matrix

# Predicted probability of class "1" (retail) for every row of the test set.
w.glm.test$pred.Industry <- predict(w.glm.adj, w.glm.test, type = "response")
# Classify at a 0.5 cut-off. The predictions are wrapped in a factor carrying
# both levels so the table stays 2 x 2 even if only one class is ever predicted.
table(
  "predicted" = factor(
    as.numeric(w.glm.test$pred.Industry >= 0.5),
    levels = c(0, 1)
  ),
  "actual" = w.glm.test$Industry
)
##          actual
## predicted  0  1
##         0 60  3
##         1  2 23

2.3 Summary

I’m not sure which metric matters most for this model, so I’m going to calculate all four evaluation metrics: accuracy, recall, precision and specificity.

# Derive the metrics from the confusion matrix itself instead of hand-copied
# counts, so they stay correct if the seed, the split or the model changes.
# Rows are predictions, columns are the truth; "1" (retail) is the positive
# class.
w.glm.cm <- table(
  "predicted" = factor(
    as.numeric(w.glm.test$pred.Industry >= 0.5),
    levels = c(0, 1)
  ),
  "actual" = w.glm.test$Industry
)
w.glm.tp <- w.glm.cm["1", "1"]
w.glm.tn <- w.glm.cm["0", "0"]
w.glm.fp <- w.glm.cm["1", "0"]
w.glm.fn <- w.glm.cm["0", "1"]
paste0(
  "Accuracy: ",
  round((w.glm.tn + w.glm.tp) / nrow(w.glm.test) * 100, 2),
  "%"
)
## [1] "Accuracy: 94.32%"
paste0("Recall: ", round(w.glm.tp / (w.glm.tp + w.glm.fn) * 100, 2), "%")
## [1] "Recall: 88.46%"
paste0("Precision: ", round(w.glm.tp / (w.glm.tp + w.glm.fp) * 100, 2), "%")
## [1] "Precision: 92%"
paste0("Specificity: ", round(w.glm.tn / (w.glm.tn + w.glm.fp) * 100, 2), "%")
## [1] "Specificity: 96.77%"

These values show that the model performs very well on the test set.

3 Nearest Neighbor Algorithm

3.1 Min-Max Normalization

# Classify the test rows with k-NN (k = 13) on the min-max scaled features,
# then cross-tabulate the predicted classes against the true labels.
w.knn.mm <- knn(
  train = w.train.knn.mm,
  test = w.test.knn.mm,
  cl = w.train.labels,
  k = 13
)
table(predicted = w.knn.mm, actual = w.test.labels)
##          actual
## predicted horeca retail
##    horeca     60      2
##    retail      2     24

Using the confusion matrix, we can calculate the evaluation metrics as follows:

# Derive the metrics from the confusion matrix itself instead of hand-copied
# counts, so they stay correct if the seed, the split or k changes.
# Rows are predictions, columns are the truth; "retail" is the positive class.
w.knn.mm.cm <- table("predicted" = w.knn.mm, "actual" = w.test.labels)
w.knn.mm.tp <- w.knn.mm.cm["retail", "retail"]
w.knn.mm.tn <- w.knn.mm.cm["horeca", "horeca"]
w.knn.mm.fp <- w.knn.mm.cm["retail", "horeca"]
w.knn.mm.fn <- w.knn.mm.cm["horeca", "retail"]
paste0(
  "Accuracy: ",
  round((w.knn.mm.tn + w.knn.mm.tp) / nrow(w.test.knn.mm) * 100, 2),
  "%"
)
## [1] "Accuracy: 95.45%"
paste0(
  "Recall: ",
  round(w.knn.mm.tp / (w.knn.mm.tp + w.knn.mm.fn) * 100, 2),
  "%"
)
## [1] "Recall: 92.31%"
paste0(
  "Precision: ",
  round(w.knn.mm.tp / (w.knn.mm.tp + w.knn.mm.fp) * 100, 2),
  "%"
)
## [1] "Precision: 92.31%"
paste0(
  "Specificity: ",
  round(w.knn.mm.tn / (w.knn.mm.tn + w.knn.mm.fp) * 100, 2),
  "%"
)
## [1] "Specificity: 96.77%"

3.2 zScore Normalization

# Classify the test rows with k-NN (k = 9) on the z-score scaled features,
# then cross-tabulate the predicted classes against the true labels.
w.knn.z <- knn(
  train = w.train.knn.z,
  test = w.test.knn.z,
  cl = w.train.labels,
  k = 9
)
table(predicted = w.knn.z, actual = w.test.labels)
##          actual
## predicted horeca retail
##    horeca     59      2
##    retail      3     24

Using the confusion matrix, we can calculate the evaluation metrics as follows:

# Derive the metrics from the confusion matrix itself instead of hand-copied
# counts, so they stay correct if the seed, the split or k changes.
# Rows are predictions, columns are the truth; "retail" is the positive class.
w.knn.z.cm <- table("predicted" = w.knn.z, "actual" = w.test.labels)
w.knn.z.tp <- w.knn.z.cm["retail", "retail"]
w.knn.z.tn <- w.knn.z.cm["horeca", "horeca"]
w.knn.z.fp <- w.knn.z.cm["retail", "horeca"]
w.knn.z.fn <- w.knn.z.cm["horeca", "retail"]
paste0(
  "Accuracy: ",
  round((w.knn.z.tn + w.knn.z.tp) / nrow(w.test.knn.z) * 100, 2),
  "%"
)
## [1] "Accuracy: 94.32%"
paste0(
  "Recall: ",
  round(w.knn.z.tp / (w.knn.z.tp + w.knn.z.fn) * 100, 2),
  "%"
)
## [1] "Recall: 92.31%"
paste0(
  "Precision: ",
  round(w.knn.z.tp / (w.knn.z.tp + w.knn.z.fp) * 100, 2),
  "%"
)
## [1] "Precision: 88.89%"
paste0(
  "Specificity: ",
  round(w.knn.z.tn / (w.knn.z.tn + w.knn.z.fp) * 100, 2),
  "%"
)
## [1] "Specificity: 95.16%"

3.3 Summary

For the wholesale data, k-NN gives a slightly better result with min-max normalization (95.45% accuracy) than with z-score normalization (94.32% accuracy).