library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(GGally)
##
## Attaching package: 'GGally'
## The following object is masked from 'package:dplyr':
##
## nasa
library(class)
The data set is available on the UCI Machine Learning Repository.
Attribute Information:
Descriptive Statistics:
(Minimum, Maximum, Mean, Std. Deviation) FRESH ( 3, 112151, 12000.30, 12647.329) MILK (55, 73498, 5796.27, 7380.377) GROCERY (3, 92780, 7951.28, 9503.163) FROZEN (25, 60869, 3071.93, 4854.673) DETERGENTS_PAPER (3, 40827, 2881.49, 4767.854) DELICATESSEN (3, 47943, 1524.87, 2820.106)
REGION Frequency Lisbon 77 Oporto 47 Other Region 316 Total 440
CHANNEL Frequency Horeca (Hotel Resto Cafe) 298 Retail 142 Total 440
# Load the wholesale customers data from the working directory and print its
# structure (column names, types and the first few values of each column).
wholesale <- read.csv(file = "wholesale.csv")
str(object = wholesale)
## 'data.frame': 440 obs. of 8 variables:
## $ Channel : int 2 2 2 1 2 2 2 2 1 2 ...
## $ Region : int 3 3 3 3 3 3 3 3 3 3 ...
## $ Fresh : int 12669 7057 6353 13265 22615 9413 12126 7579 5963 6006 ...
## $ Milk : int 9656 9810 8808 1196 5410 8259 3199 4956 3648 11093 ...
## $ Grocery : int 7561 9568 7684 4221 7198 5126 6975 9426 6192 18881 ...
## $ Frozen : int 214 1762 2405 6404 3915 666 480 1669 425 1159 ...
## $ Detergents_Paper: int 2674 3293 3516 507 1777 1795 3140 3321 1716 7425 ...
## $ Delicassen : int 1338 1776 7844 1788 5185 1451 545 2566 750 2098 ...
# Per-column five-number summary plus mean for every variable.
summary(object = wholesale)
## Channel Region Fresh Milk
## Min. :1.000 Min. :1.000 Min. : 3 Min. : 55
## 1st Qu.:1.000 1st Qu.:2.000 1st Qu.: 3128 1st Qu.: 1533
## Median :1.000 Median :3.000 Median : 8504 Median : 3627
## Mean :1.323 Mean :2.543 Mean : 12000 Mean : 5796
## 3rd Qu.:2.000 3rd Qu.:3.000 3rd Qu.: 16934 3rd Qu.: 7190
## Max. :2.000 Max. :3.000 Max. :112151 Max. :73498
## Grocery Frozen Detergents_Paper Delicassen
## Min. : 3 Min. : 25.0 Min. : 3.0 Min. : 3.0
## 1st Qu.: 2153 1st Qu.: 742.2 1st Qu.: 256.8 1st Qu.: 408.2
## Median : 4756 Median : 1526.0 Median : 816.5 Median : 965.5
## Mean : 7951 Mean : 3071.9 Mean : 2881.5 Mean : 1524.9
## 3rd Qu.:10656 3rd Qu.: 3554.2 3rd Qu.: 3922.0 3rd Qu.: 1820.2
## Max. :92780 Max. :60869.0 Max. :40827.0 Max. :47943.0
# Show the first six rows of the raw data.
head(wholesale, n = 6L)
## Channel Region Fresh Milk Grocery Frozen Detergents_Paper Delicassen
## 1 2 3 12669 9656 7561 214 2674 1338
## 2 2 3 7057 9810 9568 1762 3293 1776
## 3 2 3 6353 8808 7684 2405 3516 7844
## 4 1 3 13265 1196 4221 6404 507 1788
## 5 2 3 22615 5410 7198 3915 1777 5185
## 6 2 3 9413 8259 5126 666 1795 1451
#table(wholesale$Channel)
#table(wholesale$Region)
#cor(wholesale)
#ggcorr(wholesale, label = T,label_size = 3)
Label 1 dan 2 pada Channel diganti menjadi 0 dan 1 agar regresi logistiknya dapat menggunakan family = binomial.
Channel 1 menjadi 0 (Horeca); Channel 2 menjadi 1 (Retail).
# Confirm the data contain no missing values before recoding.
anyNA(wholesale)
## [1] FALSE
# Recode Channel from 1/2 to 0/1 so it can serve as a binomial response.
# The 1 -> 0 step must run before the 2 -> 1 step; in the reverse order the
# freshly written 1s would be turned into 0s as well.
wholesale$Channel <- local({
  codes <- wholesale$Channel
  codes <- replace(codes, codes == 1, 0)
  codes <- replace(codes, codes == 2, 1)
  as.integer(codes)
})
# Inspect the first six rows to verify the recoding.
head(wholesale, n = 6L)
## Channel Region Fresh Milk Grocery Frozen Detergents_Paper Delicassen
## 1 1 3 12669 9656 7561 214 2674 1338
## 2 1 3 7057 9810 9568 1762 3293 1776
## 3 1 3 6353 8808 7684 2405 3516 7844
## 4 0 3 13265 1196 4221 6404 507 1788
## 5 1 3 22615 5410 7198 3915 1777 5185
## 6 1 3 9413 8259 5126 666 1795 1451
#wholesale$Industry <- factor(wholesale$Channel, levels = c(0, 1), labels = c("horeca", "retail")) # Channel was recoded to 0/1 above, so levels 1/2 would no longer match
#wholesale$Region <- factor(wholesale$Region, levels = c(1,2,3), labels = c("lisbon","oporto","other region"))
Data dibagi menjadi 80% training dan 20% validasi.
# Reproducible 80/20 split: draw the training row indices without replacement,
# then use the remaining rows as the hold-out set.
set.seed(seed = 123)
clf.ws <- sample(x = nrow(wholesale), size = nrow(wholesale) * 0.8)
channel.train <- wholesale[clf.ws, ]   # rows in sampled order
channel.test <- wholesale[-clf.ws, ]   # all rows not drawn, in original order
Bagian ini membuat model regresi logistik (log-reg) dan kolom baru untuk hasil prediksi model.
#logr.ws <- glm(Channel ~ Fresh + Milk + Grocery + Frozen + Detergents_Paper + Delicassen, channel.train, family = "binomial")
# (Including Detergents_Paper and Grocery triggers the message
#  "glm.fit: fitted probabilities numerically 0 or 1 occurred")
# Why? NOTE(review): that message usually indicates (quasi-)complete
# separation, i.e. these predictors split the two channels almost perfectly
# -- confirm on this data before relying on that explanation.
#
# Fit the logistic regression on the training rows with the four remaining
# spending variables, then print the coefficient table and fit statistics.
logr.ws <- glm(
  formula = Channel ~ Fresh + Milk + Frozen + Delicassen,
  family = "binomial",
  data = channel.train
)
summary(object = logr.ws)
##
## Call:
## glm(formula = Channel ~ Fresh + Milk + Frozen + Delicassen, family = "binomial",
## data = channel.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.9672 -0.5841 -0.3116 0.4382 3.5286
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.530e+00 2.751e-01 -5.562 2.66e-08 ***
## Fresh -3.694e-05 1.610e-05 -2.293 0.0218 *
## Milk 3.693e-04 4.763e-05 7.754 8.91e-15 ***
## Frozen -3.543e-04 8.665e-05 -4.088 4.35e-05 ***
## Delicassen -9.172e-05 1.126e-04 -0.815 0.4152
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 438.81 on 351 degrees of freedom
## Residual deviance: 265.03 on 347 degrees of freedom
## AIC: 275.03
##
## Number of Fisher Scoring iterations: 7
# Score only the held-out rows and store the probabilities next to them.
# Assigning the 88 test-set predictions directly to a column of the 440-row
# `wholesale` frame is silently recycled by R (440 is a multiple of 88), so
# every row would receive a probability that belongs to another customer.
channel.test$pred.channel <- predict(logr.ws, newdata = channel.test, type = "response")
# Mirror the scores into `wholesale` (NA for training rows) so the column still
# exists for later code; `-clf.ws` selects the same rows, in the same order,
# as `channel.test`.
wholesale$pred.channel <- NA_real_
wholesale$pred.channel[-clf.ws] <- channel.test$pred.channel
Berisi tahap-tahap pengerjaan dari k-NN. Mengambil k=21 dengan alasan secara umum, akar dari jumlah datanya (440) akan (umumnya) menghasilkan k yang optimal.
# k-NN predictors: every column except the target and any prediction columns.
# Leaving `Channel` in the feature matrix leaks the label into the distance
# computation, and a prediction column is not a customer attribute.
# NOTE(review): the predictors are left unscaled, as in the original analysis;
# consider standardising them with training-set statistics before k-NN.
knn.features <- setdiff(
  names(channel.train),
  c("Channel", "pred.channel", "knn.channel")
)
channel.train.knn <- channel.train[, knn.features]
channel.test.knn <- channel.test[, knn.features]
channel.train.labels <- channel.train$Channel
channel.test$knn.channel <- knn(
  train = channel.train.knn,
  test = channel.test.knn,
  cl = channel.train.labels,
  k = 21
)
# Mirror the k-NN classes into `wholesale` as well (NA for training rows).
wholesale$knn.channel <- factor(NA, levels = levels(channel.test$knn.channel))
wholesale$knn.channel[-clf.ws] <- channel.test$knn.channel
Hasil dari Logistic Regression dan k-NN.
hist(channel.test$pred.channel) # distribution of the glm predicted probabilities
# Confusion matrix for the logistic regression (0.5 cut-off), hold-out rows only.
table("predicted" = as.numeric(channel.test$pred.channel >= 0.5), "actual" = channel.test$Channel)
## (re-run to regenerate this output: the table now counts only the 88
## held-out rows instead of all 440 rows)
# Confusion matrix for k-NN, hold-out rows only.
table("predicted" = channel.test$knn.channel, "actual" = channel.test$Channel)
## (re-run to regenerate this output: the table now counts only the 88
## held-out rows instead of all 440 rows)
Kesimpulan yang didapat adalah dari log-reg:
Kesimpulan yang didapat adalah dari K-NN:
Kesimpulan dari log-reg vs K-NN: