library(dplyr)
library(class)
# Min-max rescaling of a numeric vector.
#
# x: numeric vector.
# Returns a vector of the same length as `x` in which the smallest input maps
# to 0 and the largest to 1. min()/max() are called without na.rm, so any NA in
# `x` makes the whole result NA; a constant vector yields NaN (0 / 0).
normalize <- function(x) {
  lower <- min(x)
  span <- max(x) - lower
  (x - lower) / span
}
# Z-score standardisation of a numeric vector.
#
# x: numeric vector.
# Returns a vector of the same length as `x`, centred on mean(x) and divided
# by sd(x). mean()/sd() are called without na.rm, so any NA in `x` makes the
# whole result NA.
zscore <- function(x) {
  (x - mean(x)) / sd(x)
}

# Using the dataset given by Algoritma Training, read the data and convert the
# target into a factor.
# Read the dataset in, drop the "Region" feature because it's not interesting
# (it is removed by position: second column of the CSV).
w <- read.csv("wholesale.csv", header = TRUE)
w <- w[, -2]

# Keep two copies of the data with differently labelled targets, both derived
# from Channel (coded 1/2):
#   w      -> Industry labelled "horeca"/"retail"
#   w.glm  -> Industry labelled 0/1
w.glm <- w
w$Industry <- factor(w$Channel,
                     levels = c(1, 2), labels = c("horeca", "retail"))
w.glm$Industry <- factor(w$Channel, levels = c(1, 2), labels = c(0, 1))

# Drop the first column now that Industry replaces it
# (presumably the raw Channel column -- confirm against the CSV header).
w <- w[, -1]
w.glm <- w.glm[, -1]

# We set the training data to be 80% of the whole data.
# Fixed seed so the 80/20 split below is reproducible.
set.seed(2902)
# Row indices of the training set: 80% of the rows, sampled without replacement.
nwtrain <- sample(nrow(w), nrow(w) * 0.8)

# Logistic-regression split (0/1 target).
w.glm.train <- w.glm[nwtrain, ]
w.glm.test <- w.glm[-nwtrain, ]

# kNN predictors, min-max scaled. Column 7 is excluded from scaling and used
# as the label below (presumably Industry, the last column -- confirm).
# NOTE(review): scaling uses the full data set before splitting, so the
# test rows influence the scaling parameters.
w.mm <- as.data.frame(lapply(w[, -7], normalize))
w.train.knn.mm <- w.mm[nwtrain, ]
w.test.knn.mm <- w.mm[-nwtrain, ]

# kNN predictors, z-score scaled (same columns, same split).
w.z <- as.data.frame(lapply(w[, -7], zscore))
w.train.knn.z <- w.z[nwtrain, ]
w.test.knn.z <- w.z[-nwtrain, ]

# Class labels for the kNN train and test rows.
w.train.labels <- w[nwtrain, 7]
w.test.labels <- w[-nwtrain, 7]

# Create logistic models with no predictors and with all predictors from the
# dataset.
# Intercept-only logistic model on the training split; used later as the
# starting point for the forward/both stepwise searches.
w.glm.none <- glm(Industry ~ 1, family = "binomial", data = w.glm.train)
# Logistic model with every remaining column of w.glm.train as a predictor;
# used later as the starting point for backward search and as the upper scope.
w.glm.all <- glm(Industry ~ ., family = "binomial", data = w.glm.train)
# Print the fit summary of the intercept-only model (rendered output follows).
summary(w.glm.none)##
## Call:
## glm(formula = Industry ~ 1, family = "binomial", data = w.glm.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.8942 -0.8942 -0.8942 1.4900 1.4900
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.7102 0.1134 -6.264 3.76e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 446.23 on 351 degrees of freedom
## Residual deviance: 446.23 on 351 degrees of freedom
## AIC: 448.23
##
## Number of Fisher Scoring iterations: 4
# Print the fit summary of the all-predictors model (rendered output follows).
summary(w.glm.all)##
## Call:
## glm(formula = Industry ~ ., family = "binomial", data = w.glm.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.6567 -0.3456 -0.2487 0.0802 3.2292
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.370e+00 4.278e-01 -7.879 3.30e-15 ***
## Fresh 3.959e-06 1.693e-05 0.234 0.8151
## Milk 7.784e-05 5.779e-05 1.347 0.1780
## Grocery 1.081e-04 6.445e-05 1.677 0.0935 .
## Frozen -2.099e-04 1.042e-04 -2.015 0.0439 *
## Detergents_Paper 7.413e-04 1.392e-04 5.328 9.95e-08 ***
## Delicassen -2.670e-05 1.093e-04 -0.244 0.8070
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 446.23 on 351 degrees of freedom
## Residual deviance: 176.49 on 345 degrees of freedom
## AIC: 190.49
##
## Number of Fisher Scoring iterations: 7
# Backward elimination starting from the all-predictors model; step() compares
# candidate models by AIC (default k = 2). trace = 0 silences the per-step log.
# step() has no `data` argument (it was only forwarded via `...` and ignored),
# so it is not passed here.
summary(step(w.glm.all, direction = "backward", trace = 0))
##
## Call:
## glm(formula = Industry ~ Grocery + Frozen + Detergents_Paper,
## family = "binomial", data = w.glm.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.73325 -0.34956 -0.25964 0.07586 3.04267
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.330e+00 4.062e-01 -8.198 2.45e-16 ***
## Grocery 1.523e-04 5.471e-05 2.783 0.00538 **
## Frozen -1.689e-04 7.954e-05 -2.123 0.03376 *
## Detergents_Paper 7.380e-04 1.380e-04 5.347 8.93e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 446.23 on 351 degrees of freedom
## Residual deviance: 178.39 on 348 degrees of freedom
## AIC: 186.39
##
## Number of Fisher Scoring iterations: 7
# Forward selection starting from the intercept-only model, allowed to grow up
# to the terms of the all-predictors model. `scope$upper` is documented as a
# formula, so the formula is extracted from the fitted model explicitly.
# step() has no `data` argument, so it is not passed here.
summary(step(w.glm.none, scope = list(upper = formula(w.glm.all)),
             direction = "forward", trace = 0))
##
## Call:
## glm(formula = Industry ~ Detergents_Paper + Grocery + Frozen,
## family = "binomial", data = w.glm.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.73325 -0.34956 -0.25964 0.07586 3.04267
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.330e+00 4.062e-01 -8.198 2.45e-16 ***
## Detergents_Paper 7.380e-04 1.380e-04 5.347 8.93e-08 ***
## Grocery 1.523e-04 5.471e-05 2.783 0.00538 **
## Frozen -1.689e-04 7.954e-05 -2.123 0.03376 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 446.23 on 351 degrees of freedom
## Residual deviance: 178.39 on 348 degrees of freedom
## AIC: 186.39
##
## Number of Fisher Scoring iterations: 7
# Stepwise search in both directions, starting from the all-predictors model
# with that same model's terms as the upper bound. `scope$upper` is documented
# as a formula, so the formula is extracted from the fitted model explicitly.
# step() has no `data` argument, so it is not passed here.
summary(step(w.glm.all, scope = list(upper = formula(w.glm.all)),
             direction = "both", trace = 0))
##
## Call:
## glm(formula = Industry ~ Grocery + Frozen + Detergents_Paper,
## family = "binomial", data = w.glm.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.73325 -0.34956 -0.25964 0.07586 3.04267
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.330e+00 4.062e-01 -8.198 2.45e-16 ***
## Grocery 1.523e-04 5.471e-05 2.783 0.00538 **
## Frozen -1.689e-04 7.954e-05 -2.123 0.03376 *
## Detergents_Paper 7.380e-04 1.380e-04 5.347 8.93e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 446.23 on 351 degrees of freedom
## Residual deviance: 178.39 on 348 degrees of freedom
## AIC: 186.39
##
## Number of Fisher Scoring iterations: 7
# Stepwise search in both directions, starting from the intercept-only model
# and allowed to grow up to the terms of the all-predictors model.
# `scope$upper` is documented as a formula, so the formula is extracted from
# the fitted model explicitly. step() has no `data` argument, so it is not
# passed here.
summary(step(w.glm.none, scope = list(upper = formula(w.glm.all)),
             direction = "both", trace = 0))
##
## Call:
## glm(formula = Industry ~ Detergents_Paper + Grocery + Frozen,
## family = "binomial", data = w.glm.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.73325 -0.34956 -0.25964 0.07586 3.04267
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.330e+00 4.062e-01 -8.198 2.45e-16 ***
## Detergents_Paper 7.380e-04 1.380e-04 5.347 8.93e-08 ***
## Grocery 1.523e-04 5.471e-05 2.783 0.00538 **
## Frozen -1.689e-04 7.954e-05 -2.123 0.03376 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 446.23 on 351 degrees of freedom
## Residual deviance: 178.39 on 348 degrees of freedom
## AIC: 186.39
##
## Number of Fisher Scoring iterations: 7
# All four stepwise runs (backward, forward, and both directions from each starting model) arrive at the same best AIC, obtained when the following variables are included in the model: Detergents_Paper, Grocery and Frozen.
# Hence our logistic regression model call will be:
# Final logistic model using the three predictors kept by the stepwise search.
w.glm.adj <- glm(formula = Industry ~ Detergents_Paper + Grocery + Frozen,
                 family = "binomial", data = w.glm.train)

# Predicted probabilities on the held-out rows (type = "response" returns
# probabilities rather than log-odds), stored next to the true labels.
w.glm.test$pred.Industry <- predict(w.glm.adj, w.glm.test, type = "response")

# Confusion matrix using a 0.5 probability cut-off: rows are predictions,
# columns are the actual 0/1 labels.
table("predicted" = as.numeric(w.glm.test$pred.Industry >= 0.5),
      "actual" = w.glm.test$Industry)
## actual
## predicted 0 1
## 0 60 3
## 1 2 23
# I’m not sure which metric matters most for this model, so I’m going to calculate all four evaluation metrics.
# Evaluation metrics for the logistic model. The counts are read from the
# confusion matrix itself instead of being copied from its printed output, so
# they stay correct if the data, seed or model change. Predictions are coded
# with fixed levels 0/1 so both rows exist even if one class is never
# predicted. Class "1" is treated as the positive class.
glm.pred <- factor(as.numeric(w.glm.test$pred.Industry >= 0.5),
                   levels = c(0, 1))
glm.cm <- table("predicted" = glm.pred, "actual" = w.glm.test$Industry)
glm.tn <- glm.cm["0", "0"]  # predicted 0, actual 0
glm.fn <- glm.cm["0", "1"]  # predicted 0, actual 1
glm.fp <- glm.cm["1", "0"]  # predicted 1, actual 0
glm.tp <- glm.cm["1", "1"]  # predicted 1, actual 1

paste0("Accuracy: ",
       round(((glm.tn + glm.tp) / nrow(w.glm.test)) * 100, 2), "%")
## [1] "Accuracy: 94.32%"
paste0("Recall: ", round((glm.tp / (glm.tp + glm.fn)) * 100, 2), "%")
## [1] "Recall: 88.46%"
paste0("Precision: ", round((glm.tp / (glm.tp + glm.fp)) * 100, 2), "%")
## [1] "Precision: 92%"
paste0("Specificity: ", round((glm.tn / (glm.tn + glm.fp)) * 100, 2), "%")
## [1] "Specificity: 96.77%"
# These values show that the model performs very well on the test data.
# k-nearest-neighbour classification (k = 13) of the test rows using the
# min-max scaled predictors; w.train.labels supplies the training classes.
# NOTE(review): class::knn breaks ties at random, so the result can depend on
# the RNG state at this point in the script.
w.knn.mm <- knn(train = w.train.knn.mm, test=w.test.knn.mm, cl= w.train.labels, k=13)
# Confusion matrix: rows are kNN predictions, columns are the true labels.
table("predicted" = w.knn.mm, "actual" = w.test.labels)## actual
## predicted horeca retail
## horeca 60 2
## retail 2 24
# Using the confusion matrix, we can calculate the evaluation metrics as follows:
# Evaluation metrics for the min-max kNN model. The counts are read from the
# confusion matrix itself instead of being copied from its printed output, so
# they stay correct if the data, seed or k change. "retail" is treated as the
# positive class.
knn.mm.cm <- table("predicted" = w.knn.mm, "actual" = w.test.labels)
knn.mm.tn <- knn.mm.cm["horeca", "horeca"]  # predicted horeca, actual horeca
knn.mm.fn <- knn.mm.cm["horeca", "retail"]  # predicted horeca, actual retail
knn.mm.fp <- knn.mm.cm["retail", "horeca"]  # predicted retail, actual horeca
knn.mm.tp <- knn.mm.cm["retail", "retail"]  # predicted retail, actual retail

paste0("Accuracy: ",
       round(((knn.mm.tn + knn.mm.tp) / nrow(w.test.knn.mm)) * 100, 2), "%")
## [1] "Accuracy: 95.45%"
paste0("Recall: ",
       round((knn.mm.tp / (knn.mm.tp + knn.mm.fn)) * 100, 2), "%")
## [1] "Recall: 92.31%"
paste0("Precision: ",
       round((knn.mm.tp / (knn.mm.tp + knn.mm.fp)) * 100, 2), "%")
## [1] "Precision: 92.31%"
paste0("Specificity: ",
       round((knn.mm.tn / (knn.mm.tn + knn.mm.fp)) * 100, 2), "%")
## [1] "Specificity: 96.77%"
# k-nearest-neighbour classification (k = 9) of the test rows using the
# z-score scaled predictors; w.train.labels supplies the training classes.
# NOTE(review): class::knn breaks ties at random, so the result can depend on
# the RNG state at this point in the script.
w.knn.z <- knn(train = w.train.knn.z, test=w.test.knn.z, cl= w.train.labels, k=9)
# Confusion matrix: rows are kNN predictions, columns are the true labels.
table("predicted" = w.knn.z, "actual" = w.test.labels)## actual
## predicted horeca retail
## horeca 59 2
## retail 3 24
# Using the confusion matrix, we can calculate the evaluation metrics as follows:
# Evaluation metrics for the z-score kNN model. The counts are read from the
# confusion matrix itself instead of being copied from its printed output, so
# they stay correct if the data, seed or k change. "retail" is treated as the
# positive class.
knn.z.cm <- table("predicted" = w.knn.z, "actual" = w.test.labels)
knn.z.tn <- knn.z.cm["horeca", "horeca"]  # predicted horeca, actual horeca
knn.z.fn <- knn.z.cm["horeca", "retail"]  # predicted horeca, actual retail
knn.z.fp <- knn.z.cm["retail", "horeca"]  # predicted retail, actual horeca
knn.z.tp <- knn.z.cm["retail", "retail"]  # predicted retail, actual retail

paste0("Accuracy: ",
       round(((knn.z.tn + knn.z.tp) / nrow(w.test.knn.z)) * 100, 2), "%")
## [1] "Accuracy: 94.32%"
paste0("Recall: ",
       round((knn.z.tp / (knn.z.tp + knn.z.fn)) * 100, 2), "%")
## [1] "Recall: 92.31%"
paste0("Precision: ",
       round((knn.z.tp / (knn.z.tp + knn.z.fp)) * 100, 2), "%")
## [1] "Precision: 88.89%"
paste0("Specificity: ",
       round((knn.z.tn / (knn.z.tn + knn.z.fp)) * 100, 2), "%")
## [1] "Specificity: 95.16%"
# For the wholesale data, kNN gives a slightly better result with min-max normalization (95.45% accuracy) than with z-score standardization (94.32% accuracy).