library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.2.1 v purrr 0.3.3
## v tibble 2.1.3 v dplyr 0.8.3
## v tidyr 1.0.0 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.4.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(FactoMineR)
library(factoextra)
## Welcome! Related Books: `Practical Guide To Cluster Analysis in R` at https://goo.gl/13EFCZ
# Large scipen penalty so numbers are printed in fixed rather than scientific notation.
options(scipen = 999)

# Data exploration ----
# Read the coffee data set from disk and display its structure.
coffee <- read.csv(file = "data/coffee.csv")
str(object = coffee)
## 'data.frame': 1082 obs. of 13 variables:
## $ coffeeId : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Aroma : num 7.83 8 7.92 8 8.33 8 7.67 7.67 7.67 7.67 ...
## $ Flavor : num 8.08 7.75 7.83 7.92 7.83 7.92 7.75 7.75 7.75 7.83 ...
## $ Aftertaste : num 7.75 7.92 7.92 7.92 7.83 7.67 7.83 7.83 7.58 7.83 ...
## $ Acidity : num 7.92 8 8 7.75 7.75 8 7.83 7.67 7.83 7.83 ...
## $ Body : num 8.25 7.92 7.83 7.83 8.25 7.75 7.92 7.92 7.83 7.92 ...
## $ Balance : num 7.92 7.92 7.92 7.75 7.75 7.92 7.75 7.83 8 7.75 ...
## $ Uniformity : num 10 10 10 10 10 10 10 10 10 10 ...
## $ Clean.Cup : num 10 10 10 10 10 10 10 10 10 10 ...
## $ Sweetness : num 8 8 7.83 7.75 7.58 7.75 8 7.92 7.92 7.75 ...
## $ Cupper.Points: num 8 8 8 8.08 7.67 7.75 7.83 7.92 7.92 7.83 ...
## $ Moisture : num 0.12 0 0 0.12 0.12 0 0 0.1 0.09 0.12 ...
## $ Quakers : int 0 0 0 0 0 0 0 0 0 0 ...
# 1. Principal Component Analysis (PCA) ----
# Data pre-processing: centre and scale every column of `coffee`.
# NOTE(review): this includes the coffeeId column, which looks like a row
# identifier rather than a measurement -- confirm it is meant to enter the PCA.
coffee_scale <- scale(coffee)
# Build the principal components. The data were already standardised by
# scale() above, so PCA() is told not to rescale them (scale.unit = FALSE).
# FALSE is spelled out because the alias `F` is an ordinary, reassignable variable.
pca_coffee <- PCA(coffee_scale, scale.unit = FALSE)
summary(pca_coffee)
##
## Call:
## PCA(X = coffee_scale, scale.unit = F)
##
##
## Eigenvalues
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5 Dim.6
## Variance 6.938 1.443 0.996 0.941 0.630 0.474
## % of var. 53.418 11.114 7.670 7.244 4.853 3.653
## Cumulative % of var. 53.418 64.531 72.202 79.446 84.299 87.951
## Dim.7 Dim.8 Dim.9 Dim.10 Dim.11 Dim.12
## Variance 0.353 0.313 0.247 0.230 0.175 0.155
## % of var. 2.716 2.406 1.902 1.771 1.346 1.192
## Cumulative % of var. 90.668 93.074 94.976 96.746 98.092 99.284
## Dim.13
## Variance 0.093
## % of var. 0.716
## Cumulative % of var. 100.000
##
## Individuals (the 10 first)
## Dist Dim.1 ctr cos2 Dim.2 ctr cos2
## 1 | 4.858 | 2.793 0.104 0.331 | -2.668 0.456 0.302 |
## 2 | 5.010 | 2.743 0.100 0.300 | -3.450 0.762 0.474 |
## 3 | 5.123 | 2.623 0.092 0.262 | -3.604 0.832 0.495 |
## 4 | 4.770 | 2.278 0.069 0.228 | -2.830 0.513 0.352 |
## 5 | 5.331 | 2.434 0.079 0.208 | -2.997 0.575 0.316 |
## 6 | 5.049 | 2.281 0.069 0.204 | -3.586 0.823 0.504 |
## 7 | 4.597 | 1.978 0.052 0.185 | -3.284 0.691 0.510 |
## 8 | 4.232 | 1.801 0.043 0.181 | -2.679 0.460 0.401 |
## 9 | 4.208 | 1.792 0.043 0.181 | -2.740 0.481 0.424 |
## 10 | 4.503 | 1.805 0.043 0.161 | -2.717 0.473 0.364 |
## Dim.3 ctr cos2
## 1 0.367 0.012 0.006 |
## 2 0.076 0.001 0.000 |
## 3 0.106 0.001 0.000 |
## 4 0.432 0.017 0.008 |
## 5 0.446 0.018 0.007 |
## 6 0.109 0.001 0.000 |
## 7 0.050 0.000 0.000 |
## 8 0.315 0.009 0.006 |
## 9 0.280 0.007 0.004 |
## 10 0.392 0.014 0.008 |
##
## Variables (the 10 first)
## Dim.1 ctr cos2 Dim.2 ctr cos2 Dim.3
## coffeeId | -0.746 8.025 0.557 | 0.384 10.215 0.148 | -0.083
## Aroma | 0.855 10.542 0.732 | -0.066 0.303 0.004 | 0.021
## Flavor | 0.940 12.742 0.885 | -0.075 0.388 0.006 | 0.018
## Aftertaste | 0.933 12.536 0.871 | -0.087 0.520 0.008 | 0.013
## Acidity | 0.874 11.012 0.765 | -0.083 0.473 0.007 | -0.004
## Body | 0.854 10.518 0.730 | -0.084 0.490 0.007 | -0.007
## Balance | 0.890 11.416 0.793 | -0.076 0.401 0.006 | -0.005
## Uniformity | 0.596 5.122 0.356 | 0.519 18.670 0.270 | -0.051
## Clean.Cup | 0.534 4.107 0.285 | 0.523 18.952 0.274 | -0.067
## Sweetness | 0.412 2.445 0.170 | 0.733 37.229 0.538 | -0.104
## ctr cos2
## coffeeId 0.687 0.007 |
## Aroma 0.045 0.000 |
## Flavor 0.031 0.000 |
## Aftertaste 0.018 0.000 |
## Acidity 0.001 0.000 |
## Body 0.005 0.000 |
## Balance 0.003 0.000 |
## Uniformity 0.261 0.003 |
## Clean.Cup 0.455 0.005 |
## Sweetness 1.077 0.011 |
# Plot the PCA result with choix = "var" (the variables view).
plot.PCA(pca_coffee, choix = "var")
# Describe the dimensions, keep the `quanti` table of the first one, and show
# it as a data frame rounded to 4 decimals.
a <- dimdesc(pca_coffee)
a_pc1 <- a[[1]][["quanti"]]
as.data.frame(round(a_pc1, digits = 4))
## correlation p.value
## Flavor 0.9407 0
## Aftertaste 0.9330 0
## Balance 0.8904 0
## Cupper.Points 0.8776 0
## Acidity 0.8745 0
## Aroma 0.8556 0
## Body 0.8546 0
## Uniformity 0.5964 0
## Clean.Cup 0.5340 0
## Sweetness 0.4121 0
## Moisture -0.1752 0
## coffeeId -0.7465 0
# 2. K-Means Clustering
# 2.1 Choosing the optimum K
# Elbow-plot helper for choosing the number of k-means clusters.
#
# Fits kmeans() on `data` once for every k in 2..maxK and plots the total
# within-cluster sum of squares against k.
#
# Args:
#   data: numeric matrix or data frame accepted by kmeans().
#   maxK: largest number of clusters to try; a single number >= 2.
#
# Returns (invisibly) a data frame with columns `k` and `tot_withinss` holding
# the plotted values.
#
# The RNG is re-seeded with 654 before every fit, which makes the curve
# reproducible but also resets the caller's random stream.
kmeansTunning <- function(data, maxK) {
  # `2:maxK` counts downwards when maxK < 2 (2:1 is c(2, 1)), so reject such
  # values instead of silently fitting k = 2 and k = 1.
  if (!is.numeric(maxK) || length(maxK) != 1L || is.na(maxK) || maxK < 2) {
    stop("maxK must be a single number >= 2", call. = FALSE)
  }
  total_k <- seq(2L, as.integer(maxK))
  # One fit per k; vapply avoids growing the result vector inside a loop.
  withinall <- vapply(
    total_k,
    function(k) {
      set.seed(654)
      kmeans(data, k)$tot.withinss
    },
    numeric(1)
  )
  plot(
    x = total_k, y = withinall, type = "o",
    xlab = "Number of Cluster", ylab = "Total within"
  )
  invisible(data.frame(k = total_k, tot_withinss = withinall))
}
# kmeansTunning(your_data, maxK = 10)
# Elbow plot for k = 2..10 on the scaled data.
kmeansTunning(coffee_scale, maxK = 10)
set.seed(654)
# Drop rows 1080 and 1081 before clustering.
# NOTE(review): the reason is not recorded here -- presumably outliers; confirm.
# This line is not idempotent: re-running it removes further rows.
coffee_scale <- coffee_scale[-c(1080, 1081), ]
# Fit k-means with 5 centres on the remaining rows.
coffee_clas <- kmeans(coffee_scale, centers = 5)
set.seed(654)
# Cluster plot. TRUE is spelled out because the alias `T` is reassignable.
# NOTE(review): `habillage` and `addEllipses` are not named parameters of
# fviz_cluster() as far as I can tell -- confirm they have any effect here.
fviz_cluster(coffee_clas, coffee_scale, habillage = 5, addEllipses = TRUE)
# Find the cluster of coffee 929 (and of a few other rows).
# Convert the scaled data to a data frame and attach the k-means labels.
coffee_scale <- as.data.frame(coffee_scale)
coffee_scale[["cluster"]] <- coffee_clas$cluster
coffee_scale[929, "cluster"]
## [1] 2
coffee_scale[c(1021, 21, 1060), "cluster"]
## [1] 2 5 2
coffee_scale[218, "cluster"]
## [1] 3
# Cluster profile: mean of every numeric column within each k-means cluster.
coffee_scale %>%
  mutate(cluster = coffee_clas$cluster) %>%
  group_by(cluster) %>%
  summarise_if(is.numeric, mean)
## # A tibble: 5 x 14
## cluster coffeeId Aroma Flavor Aftertaste Acidity Body Balance
## <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 0.150 -0.0180 0.0129 -0.0172 -0.0533 -0.0735 -0.0453
## 2 2 1.36 -0.859 -1.08 -1.07 -0.832 -0.811 -1.03
## 3 3 -1.24 0.778 0.889 0.887 0.848 0.764 0.828
## 4 4 1.73 -19.7 -18.8 -18.2 -19.5 -20.5 -18.0
## 5 5 -0.273 0.0687 0.104 0.181 0.0309 0.204 0.309
## # ... with 6 more variables: Uniformity <dbl>, Clean.Cup <dbl>,
## # Sweetness <dbl>, Cupper.Points <dbl>, Moisture <dbl>, Quakers <dbl>