The goal of this exercise is to understand unsupervised learning in the context of dimensionality reduction and clustering. Dimensionality reduction shrinks the number of dimensions/variables while preserving the crucial information in the data.
Ideally, this type of exercise requires data with a large number of dimensions. The Spotify track data on kaggle.com is one good example. However, its PCA process takes too long (and needs a more powerful computer, which I don't have). We will try it next time using Google Cloud (maybe).
Read data
library(dplyr)
penjualan <- read.csv("wholesale.csv")
dim(penjualan)
## [1] 440 8
str(penjualan)
## 'data.frame': 440 obs. of 8 variables:
## $ Channel : int 2 2 2 1 2 2 2 2 1 2 ...
## $ Region : int 3 3 3 3 3 3 3 3 3 3 ...
## $ Fresh : int 12669 7057 6353 13265 22615 9413 12126 7579 5963 6006 ...
## $ Milk : int 9656 9810 8808 1196 5410 8259 3199 4956 3648 11093 ...
## $ Grocery : int 7561 9568 7684 4221 7198 5126 6975 9426 6192 18881 ...
## $ Frozen : int 214 1762 2405 6404 3915 666 480 1669 425 1159 ...
## $ Detergents_Paper: int 2674 3293 3516 507 1777 1795 3140 3321 1716 7425 ...
## $ Delicassen : int 1338 1776 7844 1788 5185 1451 545 2566 750 2098 ...
head(penjualan)
names(penjualan)
## [1] "Channel" "Region" "Fresh" "Milk"
## [5] "Grocery" "Frozen" "Detergents_Paper" "Delicassen"
Removing columns: none.
Modifying column types:
from chr to factor: none
from int to factor: Channel, Region
# change the data types
penjualan_clean <- penjualan %>%
  mutate(Channel = as.factor(Channel),
         Region = as.factor(Region))
head(penjualan_clean)
Check missing values
anyNA(penjualan_clean)
## [1] FALSE
is.na(penjualan_clean) %>% colSums()
## Channel Region Fresh Milk
## 0 0 0 0
## Grocery Frozen Detergents_Paper Delicassen
## 0 0 0 0
Pre-PCA correlation check
library(GGally)
ggcorr(penjualan_clean, label = T, hjust = 1, layout.exp = 2)
## Warning in ggcorr(penjualan_clean, label = T, hjust = 1, layout.exp = 2): data
## in column(s) 'Channel', 'Region' are not numeric and were ignored
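The warning is harmless, but one way to avoid it (a minimal sketch) is to pass ggcorr only the numeric columns:
# correlations on the numeric columns only, so nothing has to be ignored
penjualan_clean %>%
  select_if(is.numeric) %>%
  ggcorr(label = T, hjust = 1, layout.exp = 2)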
Dynamically choose numerical and categorical columns
# numerical columns (quantitative)
quanti <- penjualan_clean %>%
select_if(is.numeric) %>%
colnames()
quanti
## [1] "Fresh" "Milk" "Grocery" "Frozen"
## [5] "Detergents_Paper" "Delicassen"
# index of numerical columns
quantivar <- which(colnames(penjualan_clean) %in% quanti)
quantivar
## [1] 3 4 5 6 7 8
# categorical columns (qualitative)
quali <- penjualan_clean %>%
select_if(is.factor) %>%
colnames()
quali
## [1] "Channel" "Region"
# index of categorical columns
qualivar <- which(colnames(penjualan_clean) %in% quali)
qualivar
## [1] 1 2
FactoMineR
library(FactoMineR)
# PCA with FactoMineR
penjualan_pca <- PCA(X = penjualan_clean,
scale.unit = T,
quali.sup = qualivar,
graph = F,
ncp = 6) # 6 numeric columns
penjualan_pca
## **Results for the Principal Component Analysis (PCA)**
## The analysis was performed on 440 individuals, described by 8 variables
## *The results are available in the following objects:
##
## name description
## 1 "$eig" "eigenvalues"
## 2 "$var" "results for the variables"
## 3 "$var$coord" "coord. for the variables"
## 4 "$var$cor" "correlations variables - dimensions"
## 5 "$var$cos2" "cos2 for the variables"
## 6 "$var$contrib" "contributions of the variables"
## 7 "$ind" "results for the individuals"
## 8 "$ind$coord" "coord. for the individuals"
## 9 "$ind$cos2" "cos2 for the individuals"
## 10 "$ind$contrib" "contributions of the individuals"
## 11 "$quali.sup" "results for the supplementary categorical variables"
## 12 "$quali.sup$coord" "coord. for the supplementary categories"
## 13 "$quali.sup$v.test" "v-test of the supplementary categories"
## 14 "$call" "summary statistics"
## 15 "$call$centre" "mean of the variables"
## 16 "$call$ecart.type" "standard error of the variables"
## 17 "$call$row.w" "weights for the individuals"
## 18 "$call$col.w" "weights for the variables"
# equivalent to prcomp(data, scale. = T)
# summary(prcomp_pca)
penjualan_pca$eig
## eigenvalue percentage of variance cumulative percentage of variance
## comp 1 2.64497357 44.082893 44.08289
## comp 2 1.70258397 28.376400 72.45929
## comp 3 0.74006477 12.334413 84.79371
## comp 4 0.56373023 9.395504 94.18921
## comp 5 0.28567634 4.761272 98.95048
## comp 6 0.06297111 1.049519 100.00000
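A quick way to decide how many PCs to keep (a minimal sketch, assuming we want to retain at least 90% of the variance):
# first component whose cumulative variance reaches 90%
which(penjualan_pca$eig[, "cumulative percentage of variance"] >= 90)[1]
# from the table above this is comp 4, with 94.19% cumulative variance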
# check each observation's score on each PC (its projection onto the new PC axes)
# equivalent to pca$x
head(penjualan_pca$ind$coord)
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5 Dim.6
## 1 0.1932905 -0.3051000 0.14087845 -0.4864315 -0.4952815 0.007414139
## 2 0.4344199 -0.3284126 -0.31900662 -0.1788304 -0.3655793 -0.054571846
## 3 0.8111432 0.8150957 -1.52341562 -1.2540815 0.3790535 0.277538576
## 4 -0.7786478 0.6527537 -0.16301227 0.3800601 0.2761376 -0.060717538
## 5 0.1662873 1.2714337 -0.06627939 -0.8262267 0.3942107 0.026824641
## 6 -0.1561699 -0.2951410 -0.14761194 -0.4182883 -0.4794549 0.053940320
# related to the eigenvectors (pca$rotation): each column of $var$coord is the
# corresponding eigenvector scaled by the square root of its eigenvalue
penjualan_pca$var$coord
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5
## Fresh 0.06974378 0.6888622 0.69876010 -0.17770820 0.02602031
## Milk 0.88654635 0.1085197 -0.05194998 -0.06546390 -0.44179588
## Grocery 0.94206631 -0.1906204 0.09323953 0.07957747 0.16836310
## Frozen 0.08324992 0.7976140 -0.15346028 0.57714209 0.01492942
## Detergents_Paper 0.89227411 -0.3330361 0.11716213 0.12894882 0.18153322
## Delicassen 0.40444082 0.6579050 -0.45069909 -0.41450107 0.16820333
## Dim.6
## Fresh 0.009040229
## Milk 0.009545823
## Grocery -0.181114915
## Frozen 0.003923994
## Detergents_Paper 0.172118487
## Delicassen 0.018854190
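As a cross-check, a minimal sketch of the equivalent computation with base R's prcomp() on the numeric columns only (using the quanti vector defined earlier); note that component signs may be flipped relative to FactoMineR:
penjualan_prcomp <- prcomp(penjualan_clean[, quanti], scale. = TRUE)
# squared standard deviations are the eigenvalues shown in penjualan_pca$eig
penjualan_prcomp$sdev^2
# loadings (eigenvectors); FactoMineR's $var$coord scales these by sqrt(eigenvalue)
penjualan_prcomp$rotation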
Using plot.PCA():
# individual factor map
plot.PCA(
x = penjualan_pca,
choix = "ind",
invisible = "quali",
select = "contrib 5",
habillage = "Channel"
)
# individual factor map
plot.PCA(
x = penjualan_pca,
choix = "ind",
invisible = "quali",
select = "contrib 5",
habillage = "Region"
)
The individual plot can be used to identify outliers.
# variables factor map
plot.PCA(
x = penjualan_pca,
choix = "var"
)
Note: the percentages of Dim 1 (44.08%) and Dim 2 (28.38%) show how well these two axes summarize the information in the data (72.46% cumulative).
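The same information can be read off a scree plot (a minimal sketch, assuming the factoextra package is installed):
library(factoextra)
fviz_eig(penjualan_pca, addlabels = TRUE) # percentage of variance per dimension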
The k-means algorithm is a centroid-based clustering method: each cluster has one centroid that represents that particular cluster.
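To make the mechanics concrete, here is a toy sketch of the two alternating steps (assignment, then centroid update); it is illustrative only, not a replacement for kmeans(), and assumes scaled numeric columns, k = 3, a fixed 10 iterations, and no empty-cluster handling:
set.seed(100)
X <- scale(penjualan_clean[, quanti]) # numeric columns only
centroids <- X[sample(nrow(X), 3), ]  # 3 random observations as initial centroids
for (i in 1:10) {
  # assignment step: each observation joins its nearest centroid
  d <- as.matrix(dist(rbind(centroids, X)))[-(1:3), 1:3]
  cluster <- apply(d, 1, which.min)
  # update step: each centroid moves to the mean of its members
  centroids <- apply(X, 2, function(col) tapply(col, cluster, mean))
}
table(cluster)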
RNGkind(sample.kind = "Rounding")
## Warning in RNGkind(sample.kind = "Rounding"): non-uniform 'Rounding' sampler
## used
set.seed(100)
# k-means with 3 clusters
penjualan_km <- kmeans(x = penjualan_clean,
centers = 3)
penjualan_km$iter
## [1] 3
penjualan_km$size
## [1] 50 330 60
penjualan_km$centers
## Channel Region Fresh Milk Grocery Frozen Detergents_Paper
## 1 1.960000 2.440000 8000.04 18511.420 27573.900 1996.680 12407.360
## 2 1.260606 2.554545 8253.47 3824.603 5280.455 2572.661 1773.058
## 3 1.133333 2.566667 35941.40 6044.450 6288.617 6713.967 1039.667
## Delicassen
## 1 2252.020
## 2 1137.497
## 3 3049.467
Cluster label for each observation:
head(penjualan_km$cluster)
## [1] 2 2 2 2 3 2
The goodness of the clustering results can be seen from 3 values:
$withinss (WSS): the sum of squared distances from each observation to the centroid of its cluster.
$betweenss (BSS): the weighted sum of squared distances from each centroid to the global mean, weighted by the number of observations in each cluster.
$totss (TSS): the sum of squared distances from each observation to the global mean.
# check the WSS values
penjualan_km$withinss
## [1] 26382784712 28184319111 25765310355
sum(penjualan_km$withinss)
## [1] 80332414178
penjualan_km$tot.withinss
## [1] 80332414178
# check the BSS/TSS ratio
penjualan_km$betweenss
## [1] 77263443347
penjualan_km$totss
## [1] 157595857525
penjualan_km$betweenss / penjualan_km$totss
## [1] 0.4902632
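These three quantities satisfy a simple decomposition, TSS = WSS + BSS, which we can sanity-check directly:
# total sum of squares decomposes into within-cluster + between-cluster
penjualan_km$tot.withinss + penjualan_km$betweenss
## [1] 157595857525
all.equal(penjualan_km$totss, penjualan_km$tot.withinss + penjualan_km$betweenss)
## [1] TRUE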
A BSS/TSS ratio of only 0.49 means our clustering is not yet optimal.
The criteria of "good" clustering: the lower the total WSS and the higher the BSS/TSS ratio, the better. Increasing k always reduces the total WSS; however, a high k is not necessarily the best choice, because a cluster may end up containing very little data and therefore be meaningless.
If that is the case, should we always pick k equal to the number of observations? How do we determine the optimum k?
fviz_nbclust() from the factoextra package:
library(factoextra)
fviz_nbclust(x = penjualan_clean,
             FUNcluster = kmeans, # clustering function
             method = "wss") # within sum of squares
Pick the k at which adding another cluster no longer decreases the total WSS drastically (i.e. where the curve starts to flatten).
The optimum k for our data is k = 5 (the first point where the curve clearly flattens).
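As a cross-check of the elbow choice, the average silhouette width is another common criterion (a minimal sketch using the same fviz_nbclust() helper; it may suggest a different k):
fviz_nbclust(x = penjualan_clean,
             FUNcluster = kmeans,
             method = "silhouette") # higher average width is better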
Re-create the k-means clustering with the optimum k:
RNGkind(sample.kind = "Rounding")
## Warning in RNGkind(sample.kind = "Rounding"): non-uniform 'Rounding' sampler
## used
set.seed(100)
# based on the elbow method
penjualan_km_opt <- kmeans(x = penjualan_clean,
centers = 5)
head(penjualan_km_opt$cluster)
## [1] 1 1 1 1 2 1
# create a new column containing the cluster labels
penjualan_clean$kelompok <- as.factor(penjualan_km_opt$cluster)
# profile the clusters
penjualan_centroid <- penjualan_clean %>%
group_by(kelompok) %>%
summarise_all(mean)
## Warning in mean.default(Channel): argument is not numeric or logical: returning
## NA
## Warning in mean.default(Region): argument is not numeric or logical: returning
## NA
## (each warning is repeated once per cluster, 5 times in total)
# same as penjualan_km_opt$centers
penjualan_centroid
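The warnings above come from averaging the factor columns; a minimal sketch that summarises only the numeric columns (assuming dplyr >= 1.0 for across()):
penjualan_clean %>%
  group_by(kelompok) %>%
  summarise(across(where(is.numeric), mean))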
# to make profiling easier
library(ggiraphExtra)
## Warning: package 'ggiraphExtra' was built under R version 4.2.2
ggRadar(data = penjualan_clean,
        aes(colour = kelompok),
        interactive = TRUE)
Suppose a customer who loves the items bought by the customer on row 1 comes to our store, but those items are out of stock. What items should we recommend? Unfortunately, this wholesale.csv data has no name label for each row, so this is somewhat hard to apply in practice.
# penjualan_clean
penjualan_clean["1", ]
penjualan_clean["1", "kelompok"]
## [1] 1
## Levels: 1 2 3 4 5
penjualan_clean[penjualan_clean$kelompok == 1, ]
Based on the data above, we can recommend items similar to those in the list above, i.e. items from observations in the same cluster (kelompok 1).
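One hypothetical way to turn this into a concrete recommendation (an assumption on top of the original analysis: rank the product categories by the centroid values of the customer's cluster):
# hypothetical helper: categories cluster 1 spends the most on, in order
centro_1 <- penjualan_km_opt$centers["1", quanti]
sort(centro_1, decreasing = TRUE)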
Goal: compare the k-means clustering results with logistic regression.
library(dplyr)
penjualan_klasifikasi <- read.csv("wholesale.csv")
head(penjualan_klasifikasi)
# change the data types
penjualan_klasifikasi_clean <- penjualan_klasifikasi %>%
  mutate(Channel = as.factor(Channel),
         Region = as.factor(Region))
head(penjualan_klasifikasi_clean)
anyNA(penjualan_klasifikasi_clean)
## [1] FALSE
is.na(penjualan_klasifikasi_clean) %>% colSums()
## Channel Region Fresh Milk
## 0 0 0 0
## Grocery Frozen Detergents_Paper Delicassen
## 0 0 0 0
Check class imbalance
table(penjualan_klasifikasi_clean$Channel)
##
## 1 2
## 298 142
prop.table(table(penjualan_klasifikasi_clean$Channel))
##
## 1 2
## 0.6772727 0.3227273
There is a class imbalance in the target column (Channel).
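As a first step toward the planned comparison, a minimal sketch of a logistic regression on Channel (assumptions: no train/test split and no imbalance handling are shown here):
model_glm <- glm(Channel ~ Fresh + Milk + Grocery + Frozen +
                   Detergents_Paper + Delicassen,
                 data = penjualan_klasifikasi_clean,
                 family = "binomial")
summary(model_glm)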
As mentioned at the start, I initially wanted to use the Spotify track data from kaggle.com, but the PCA process turned out to take a very long time because that data consists of hundreds of thousands of rows. After submitting this LBB, I will try PCA again with a smaller amount of data.