Quiz UL

library(tidyverse)

## -- Attaching packages --------------------------------------- tidyverse 1.2.1 --

## v ggplot2 3.2.1     v purrr   0.3.3
## v tibble  2.1.3     v dplyr   0.8.3
## v tidyr   1.0.0     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.4.0

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(FactoMineR)
library(factoextra)

## Welcome! Related Books: `Practical Guide To Cluster Analysis in R` at https://goo.gl/13EFCZ

# Penalise scientific notation heavily so printed numbers use fixed notation.
options(scipen = 999)

# Data Exploration
# Read the coffee data set and inspect its structure (column names and types).
coffee <- read.csv(file = "data/coffee.csv")
str(object = coffee)

## 'data.frame':    1082 obs. of  13 variables:
##  $ coffeeId     : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Aroma        : num  7.83 8 7.92 8 8.33 8 7.67 7.67 7.67 7.67 ...
##  $ Flavor       : num  8.08 7.75 7.83 7.92 7.83 7.92 7.75 7.75 7.75 7.83 ...
##  $ Aftertaste   : num  7.75 7.92 7.92 7.92 7.83 7.67 7.83 7.83 7.58 7.83 ...
##  $ Acidity      : num  7.92 8 8 7.75 7.75 8 7.83 7.67 7.83 7.83 ...
##  $ Body         : num  8.25 7.92 7.83 7.83 8.25 7.75 7.92 7.92 7.83 7.92 ...
##  $ Balance      : num  7.92 7.92 7.92 7.75 7.75 7.92 7.75 7.83 8 7.75 ...
##  $ Uniformity   : num  10 10 10 10 10 10 10 10 10 10 ...
##  $ Clean.Cup    : num  10 10 10 10 10 10 10 10 10 10 ...
##  $ Sweetness    : num  8 8 7.83 7.75 7.58 7.75 8 7.92 7.92 7.75 ...
##  $ Cupper.Points: num  8 8 8 8.08 7.67 7.75 7.83 7.92 7.92 7.83 ...
##  $ Moisture     : num  0.12 0 0 0.12 0.12 0 0 0.1 0.09 0.12 ...
##  $ Quakers      : int  0 0 0 0 0 0 0 0 0 0 ...

#1. Principal Component Analysis (PCA)
# Data pre-processing: centre and scale every column of `coffee`. The scaled
# matrix is reused by the k-means steps later in this file.
# NOTE(review): coffeeId looks like a row identifier rather than a cupping
# measurement, yet it is scaled and passed to the PCA together with the other
# columns -- confirm whether it should be excluded.
coffee_scale <- scale(coffee)

# Build the principal components. The input is already standardised above, so
# PCA's own unit scaling is disabled. FALSE is spelled out because `F` is an
# ordinary variable that can be reassigned.
pca_coffee <- PCA(coffee_scale, scale.unit = FALSE)

summary(pca_coffee)

## 
## Call:
## PCA(X = coffee_scale, scale.unit = FALSE) 
## 
## 
## Eigenvalues
##                        Dim.1   Dim.2   Dim.3   Dim.4   Dim.5   Dim.6
## Variance               6.938   1.443   0.996   0.941   0.630   0.474
## % of var.             53.418  11.114   7.670   7.244   4.853   3.653
## Cumulative % of var.  53.418  64.531  72.202  79.446  84.299  87.951
##                        Dim.7   Dim.8   Dim.9  Dim.10  Dim.11  Dim.12
## Variance               0.353   0.313   0.247   0.230   0.175   0.155
## % of var.              2.716   2.406   1.902   1.771   1.346   1.192
## Cumulative % of var.  90.668  93.074  94.976  96.746  98.092  99.284
##                       Dim.13
## Variance               0.093
## % of var.              0.716
## Cumulative % of var. 100.000
## 
## Individuals (the 10 first)
##                   Dist    Dim.1    ctr   cos2    Dim.2    ctr   cos2  
## 1             |  4.858 |  2.793  0.104  0.331 | -2.668  0.456  0.302 |
## 2             |  5.010 |  2.743  0.100  0.300 | -3.450  0.762  0.474 |
## 3             |  5.123 |  2.623  0.092  0.262 | -3.604  0.832  0.495 |
## 4             |  4.770 |  2.278  0.069  0.228 | -2.830  0.513  0.352 |
## 5             |  5.331 |  2.434  0.079  0.208 | -2.997  0.575  0.316 |
## 6             |  5.049 |  2.281  0.069  0.204 | -3.586  0.823  0.504 |
## 7             |  4.597 |  1.978  0.052  0.185 | -3.284  0.691  0.510 |
## 8             |  4.232 |  1.801  0.043  0.181 | -2.679  0.460  0.401 |
## 9             |  4.208 |  1.792  0.043  0.181 | -2.740  0.481  0.424 |
## 10            |  4.503 |  1.805  0.043  0.161 | -2.717  0.473  0.364 |
##                Dim.3    ctr   cos2  
## 1              0.367  0.012  0.006 |
## 2              0.076  0.001  0.000 |
## 3              0.106  0.001  0.000 |
## 4              0.432  0.017  0.008 |
## 5              0.446  0.018  0.007 |
## 6              0.109  0.001  0.000 |
## 7              0.050  0.000  0.000 |
## 8              0.315  0.009  0.006 |
## 9              0.280  0.007  0.004 |
## 10             0.392  0.014  0.008 |
## 
## Variables (the 10 first)
##                  Dim.1    ctr   cos2    Dim.2    ctr   cos2    Dim.3
## coffeeId      | -0.746  8.025  0.557 |  0.384 10.215  0.148 | -0.083
## Aroma         |  0.855 10.542  0.732 | -0.066  0.303  0.004 |  0.021
## Flavor        |  0.940 12.742  0.885 | -0.075  0.388  0.006 |  0.018
## Aftertaste    |  0.933 12.536  0.871 | -0.087  0.520  0.008 |  0.013
## Acidity       |  0.874 11.012  0.765 | -0.083  0.473  0.007 | -0.004
## Body          |  0.854 10.518  0.730 | -0.084  0.490  0.007 | -0.007
## Balance       |  0.890 11.416  0.793 | -0.076  0.401  0.006 | -0.005
## Uniformity    |  0.596  5.122  0.356 |  0.519 18.670  0.270 | -0.051
## Clean.Cup     |  0.534  4.107  0.285 |  0.523 18.952  0.274 | -0.067
## Sweetness     |  0.412  2.445  0.170 |  0.733 37.229  0.538 | -0.104
##                  ctr   cos2  
## coffeeId       0.687  0.007 |
## Aroma          0.045  0.000 |
## Flavor         0.031  0.000 |
## Aftertaste     0.018  0.000 |
## Acidity        0.001  0.000 |
## Body           0.005  0.000 |
## Balance        0.003  0.000 |
## Uniformity     0.261  0.003 |
## Clean.Cup      0.455  0.005 |
## Sweetness      1.077  0.011 |

# Plot the variables (rather than the individuals) of the fitted PCA.
plot.PCA(pca_coffee, choix = "var")

# Describe the dimensions, keep the quantitative-variable table of the first
# one, and show it as a data frame rounded to four decimals.
a <- dimdesc(pca_coffee)
a_pc1 <- a[[1]][["quanti"]]
as.data.frame(round(a_pc1, digits = 4))

##               correlation p.value
## Flavor             0.9407       0
## Aftertaste         0.9330       0
## Balance            0.8904       0
## Cupper.Points      0.8776       0
## Acidity            0.8745       0
## Aroma              0.8556       0
## Body               0.8546       0
## Uniformity         0.5964       0
## Clean.Cup          0.5340       0
## Sweetness          0.4121       0
## Moisture          -0.1752       0
## coffeeId          -0.7465       0

#2. K-Means Clustering
#2.1 Choosing Optimum K

# Elbow-plot helper for choosing the number of k-means clusters.
#
# Fits kmeans() on `data` once for every k in 2..maxK and plots the total
# within-cluster sum of squares against k.
#
# Args:
#   data: data passed straight to kmeans() (e.g. a numeric matrix).
#   maxK: largest number of clusters to try; a single number >= 2.
#
# Returns: the value of the final plot() call; the function is used for the
#   plot it draws.
#
# Side effect: set.seed(654) is called before every fit so each k is
#   reproducible, which also resets the global RNG state for the caller.
kmeansTunning <- function(data, maxK) {
  # `2:maxK` counts downwards (2, 1) when maxK < 2 and would plot a meaningless
  # curve without any error, so reject such values up front.
  if (!is.numeric(maxK) || length(maxK) != 1L || is.na(maxK) || maxK < 2) {
    stop("`maxK` must be a single number >= 2.", call. = FALSE)
  }

  total_k <- 2:maxK

  # One fit per candidate k; vapply avoids growing a vector inside a loop and
  # guarantees a numeric result of the same length as total_k.
  withinall <- vapply(
    total_k,
    function(k) {
      set.seed(654)
      kmeans(data, k)$tot.withinss
    },
    numeric(1)
  )

  plot(
    x = total_k,
    y = withinall,
    type = "o",
    xlab = "Number of Cluster",
    ylab = "Total within"
  )
}

# Usage: kmeansTunning(your_data, maxK = 10)
# Draw the elbow plot for k = 2..10 on the scaled coffee data.
kmeansTunning(coffee_scale, maxK = 10)

# Fix the seed so the random starting centres, and therefore the cluster
# labels, are reproducible.
set.seed(654)
# Drop rows 1080 and 1081 before the final fit.
# NOTE(review): presumably these rows are outliers -- the reason is not
# recorded here, and the elbow plot above was drawn with them still included.
coffee_scale <- coffee_scale[-c(1080, 1081), ]
coffee_clas <- kmeans(coffee_scale, centers = 5)

set.seed(654)
# NOTE(review): `habillage` and `addEllipses` do not appear to be named
# arguments of fviz_cluster() (it documents `ellipse` instead), so they are
# forwarded through `...` -- confirm they have the intended effect.
fviz_cluster(coffee_clas, coffee_scale, habillage = 5, addEllipses = TRUE)

# Find the cluster of coffee 929.
# The scaled matrix is turned into a data frame so the k-means labels can be
# stored in their own column next to the features.
# NOTE(review): the lookups below index by row position; this matches the
# original row number only because the rows dropped earlier (1080, 1081) come
# after every position queried here -- assumes coffeeId equals the row number.
coffee_scale <- as.data.frame(coffee_scale)
coffee_scale[["cluster"]] <- coffee_clas$cluster
coffee_scale[929, "cluster"]

## [1] 2

coffee_scale[c(1021, 21, 1060), "cluster"]

## [1] 2 5 2

coffee_scale[218, "cluster"]

## [1] 3

# Profile the clusters: mean of every numeric column within each cluster.
# The `cluster` column assigned above already holds coffee_clas$cluster, so it
# is used directly as the grouping key.
coffee_scale %>%
  group_by(cluster) %>%
  summarise_if(.predicate = is.numeric, .funs = mean)

## # A tibble: 5 x 14
##   cluster coffeeId    Aroma   Flavor Aftertaste  Acidity     Body  Balance
##     <int>    <dbl>    <dbl>    <dbl>      <dbl>    <dbl>    <dbl>    <dbl>
## 1       1    0.150  -0.0180   0.0129    -0.0172  -0.0533  -0.0735  -0.0453
## 2       2    1.36   -0.859   -1.08      -1.07    -0.832   -0.811   -1.03  
## 3       3   -1.24    0.778    0.889      0.887    0.848    0.764    0.828 
## 4       4    1.73  -19.7    -18.8      -18.2    -19.5    -20.5    -18.0   
## 5       5   -0.273   0.0687   0.104      0.181    0.0309   0.204    0.309 
## # ... with 6 more variables: Uniformity <dbl>, Clean.Cup <dbl>,
## #   Sweetness <dbl>, Cupper.Points <dbl>, Moisture <dbl>, Quakers <dbl>

Quiz UL

Hafni Marfuah

11/21/2019