Scotch Whisky Dataset

Scotch Whisky

The dataset contains flavour scores for 86 Scotch malt whiskies.

#Load library
library(tidyverse)

## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --

## v ggplot2 3.3.3     v purrr   0.3.4
## v tibble  3.0.5     v dplyr   1.0.3
## v tidyr   1.1.2     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.1

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

# Load the whisky data from "whisky.csv" (path relative to the working
# directory). read_csv() guesses the column types and returns a tibble.
data_set <- read_csv("whisky.csv")

## 
## -- Column specification --------------------------------------------------------
## cols(
##   RowID = col_double(),
##   Distillery = col_character(),
##   Body = col_double(),
##   Sweetness = col_double(),
##   Smoky = col_double(),
##   Medicinal = col_double(),
##   Tobacco = col_double(),
##   Honey = col_double(),
##   Spicy = col_double(),
##   Winey = col_double(),
##   Nutty = col_double(),
##   Malty = col_double(),
##   Fruity = col_double(),
##   Floral = col_double(),
##   Postcode = col_character(),
##   Latitude = col_double(),
##   Longitude = col_double()
## )

# Preview the first 10 rows.
head(data_set, 10)

## # A tibble: 10 x 17
##    RowID Distillery  Body Sweetness Smoky Medicinal Tobacco Honey Spicy Winey
##    <dbl> <chr>      <dbl>     <dbl> <dbl>     <dbl>   <dbl> <dbl> <dbl> <dbl>
##  1     1 Aberfeldy      2         2     2         0       0     2     1     2
##  2     2 Aberlour       3         3     1         0       0     4     3     2
##  3     3 AnCnoc         1         3     2         0       0     2     0     0
##  4     4 Ardbeg         4         1     4         4       0     0     2     0
##  5     5 Ardmore        2         2     2         0       0     1     1     1
##  6     6 ArranIsle~     2         3     1         1       0     1     1     1
##  7     7 Auchentos~     0         2     0         0       0     1     1     0
##  8     8 Auchroisk      2         3     1         0       0     2     1     2
##  9     9 Aultmore       2         2     1         0       0     1     0     0
## 10    10 Balblair       2         3     2         1       0     0     2     0
## # ... with 7 more variables: Nutty <dbl>, Malty <dbl>, Fruity <dbl>,
## #   Floral <dbl>, Postcode <chr>, Latitude <dbl>, Longitude <dbl>

# Number of rows and columns.
dim(data_set)

## [1] 86 17

# Compact overview of every column: type and first values.
str(data_set)

## tibble [86 x 17] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ RowID     : num [1:86] 1 2 3 4 5 6 7 8 9 10 ...
##  $ Distillery: chr [1:86] "Aberfeldy" "Aberlour" "AnCnoc" "Ardbeg" ...
##  $ Body      : num [1:86] 2 3 1 4 2 2 0 2 2 2 ...
##  $ Sweetness : num [1:86] 2 3 3 1 2 3 2 3 2 3 ...
##  $ Smoky     : num [1:86] 2 1 2 4 2 1 0 1 1 2 ...
##  $ Medicinal : num [1:86] 0 0 0 4 0 1 0 0 0 1 ...
##  $ Tobacco   : num [1:86] 0 0 0 0 0 0 0 0 0 0 ...
##  $ Honey     : num [1:86] 2 4 2 0 1 1 1 2 1 0 ...
##  $ Spicy     : num [1:86] 1 3 0 2 1 1 1 1 0 2 ...
##  $ Winey     : num [1:86] 2 2 0 0 1 1 0 2 0 0 ...
##  $ Nutty     : num [1:86] 2 2 2 1 2 0 2 2 2 2 ...
##  $ Malty     : num [1:86] 2 3 2 2 3 1 2 2 2 1 ...
##  $ Fruity    : num [1:86] 2 3 3 1 1 1 3 2 2 2 ...
##  $ Floral    : num [1:86] 2 2 2 0 1 2 3 1 2 1 ...
##  $ Postcode  : chr [1:86] "PH15 2EB" "AB38 9PJ" "AB5 5LI" "PA42 7EB" ...
##  $ Latitude  : num [1:86] 286580 326340 352960 141560 355350 ...
##  $ Longitude : num [1:86] 749680 842570 839320 646220 829140 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   RowID = col_double(),
##   ..   Distillery = col_character(),
##   ..   Body = col_double(),
##   ..   Sweetness = col_double(),
##   ..   Smoky = col_double(),
##   ..   Medicinal = col_double(),
##   ..   Tobacco = col_double(),
##   ..   Honey = col_double(),
##   ..   Spicy = col_double(),
##   ..   Winey = col_double(),
##   ..   Nutty = col_double(),
##   ..   Malty = col_double(),
##   ..   Fruity = col_double(),
##   ..   Floral = col_double(),
##   ..   Postcode = col_character(),
##   ..   Latitude = col_double(),
##   ..   Longitude = col_double()
##   .. )

# Column names, in file order.
names(data_set)

##  [1] "RowID"      "Distillery" "Body"       "Sweetness"  "Smoky"     
##  [6] "Medicinal"  "Tobacco"    "Honey"      "Spicy"      "Winey"     
## [11] "Nutty"      "Malty"      "Fruity"     "Floral"     "Postcode"  
## [16] "Latitude"   "Longitude"

Data visualization

# Check whether the dataset has missing values: vis_miss() (from naniar)
# plots the missingness of every column.
library(naniar)
vis_miss(data_set)

#Correlation of dataset
library(corrplot)

## corrplot 0.84 loaded

# Pairwise correlations between the twelve flavour scores
# (columns Body through Floral, i.e. positions 3 to 14 of data_set).
correlations <- data_set %>%
  select(Body:Floral) %>%
  cor()
print(correlations)

##                  Body    Sweetness       Smoky   Medicinal      Tobacco
## Body       1.00000000 -0.136517815  0.52403158  0.35405026  0.168718106
## Sweetness -0.13651781  1.000000000 -0.40589707 -0.39201677 -0.147870502
## Smoky      0.52403158 -0.405897070  1.00000000  0.68607052  0.365500702
## Medicinal  0.35405026 -0.392016773  0.68607052  1.00000000  0.425105554
## Tobacco    0.16871811 -0.147870502  0.36550070  0.42510555  1.000000000
## Honey      0.08203083  0.132558139 -0.19531751 -0.39662863 -0.275490317
## Spicy      0.18849952 -0.054199926  0.23174451  0.04490323  0.054067800
## Winey      0.40857633  0.115727403 -0.02819042 -0.20265132  0.009096926
## Nutty      0.12632342 -0.032492924 -0.02313214 -0.11367135 -0.117717350
## Malty     -0.11685910 -0.001515808 -0.19287531 -0.25895929 -0.059347395
## Fruity    -0.01320457  0.019819512 -0.31296978 -0.33097549 -0.235145442
## Floral    -0.46120320  0.144986800 -0.43166295 -0.51132279 -0.212375174
##                 Honey       Spicy        Winey       Nutty        Malty
## Body       0.08203083  0.18849952  0.408576331  0.12632342 -0.116859104
## Sweetness  0.13255814 -0.05419993  0.115727403 -0.03249292 -0.001515808
## Smoky     -0.19531751  0.23174451 -0.028190424 -0.02313214 -0.192875307
## Medicinal -0.39662863  0.04490323 -0.202651322 -0.11367135 -0.258959287
## Tobacco   -0.27549032  0.05406780  0.009096926 -0.11771735 -0.059347395
## Honey      1.00000000  0.13956277  0.362020513  0.18849200  0.310184243
## Spicy      0.13956277  1.00000000  0.092704030 -0.04285577  0.036303005
## Winey      0.36202051  0.09270403  1.000000000  0.19846716  0.112368417
## Nutty      0.18849200 -0.04285577  0.198467159  1.00000000  0.066157368
## Malty      0.31018424  0.03630300  0.112368417  0.06615737  1.000000000
## Fruity     0.10882249  0.14471361  0.090693903  0.07176477  0.207288023
## Floral     0.18302902  0.03466260 -0.126931653  0.01830236  0.106308708
##                Fruity      Floral
## Body      -0.01320457 -0.46120320
## Sweetness  0.01981951  0.14498680
## Smoky     -0.31296978 -0.43166295
## Medicinal -0.33097549 -0.51132279
## Tobacco   -0.23514544 -0.21237517
## Honey      0.10882249  0.18302902
## Spicy      0.14471361  0.03466260
## Winey      0.09069390 -0.12693165
## Nutty      0.07176477  0.01830236
## Malty      0.20728802  0.10630871
## Fruity     1.00000000  0.26233561
## Floral     0.26233561  1.00000000

# Draw the correlation matrix as a coloured grid with the coefficient printed
# in every cell; order = "AOE" reorders the variables by the angular order of
# the eigenvectors.
# cl.pos = "n" hides the colour legend. The former `addcolorlabel = "no"` is
# not a corrplot() argument in this version: it was forwarded to the base
# graphics calls and only produced "not a graphical parameter" warnings.
corrplot(correlations, method = "color", tl.col = "black", tl.cex = 0.5,
         addCoef.col = "black", cl.pos = "n", order = "AOE",
         number.cex = 0.5, is.corr = TRUE)

## Warning in text.default(pos.xlabel[, 1], pos.xlabel[, 2], newcolnames, srt =
## tl.srt, : "addcolorlabel" is not a graphical parameter

## Warning in text.default(pos.ylabel[, 1], pos.ylabel[, 2], newrownames, col =
## tl.col, : "addcolorlabel" is not a graphical parameter

## Warning in title(title, ...): "addcolorlabel" is not a graphical parameter

#### Data processing

# Keep the distillery name plus the twelve flavour scores (columns 2 to 14)
# as a plain data frame, which supports the row names set in the next step.
whisky <- as.data.frame(data_set[, 2:14])
head(whisky)

##    Distillery Body Sweetness Smoky Medicinal Tobacco Honey Spicy Winey Nutty
## 1   Aberfeldy    2         2     2         0       0     2     1     2     2
## 2    Aberlour    3         3     1         0       0     4     3     2     2
## 3      AnCnoc    1         3     2         0       0     2     0     0     2
## 4      Ardbeg    4         1     4         4       0     0     2     0     1
## 5     Ardmore    2         2     2         0       0     1     1     1     2
## 6 ArranIsleOf    2         3     1         1       0     1     1     1     0
##   Malty Fruity Floral
## 1     2      2      2
## 2     3      3      2
## 3     2      3      2
## 4     2      1      0
## 5     3      1      1
## 6     1      1      2

# Use the distillery names as row names and drop the name column, so that
# only numeric flavour scores remain for clustering.
rownames(whisky) <- whisky$Distillery
whisky$Distillery <- NULL
head(whisky)

##             Body Sweetness Smoky Medicinal Tobacco Honey Spicy Winey Nutty
## Aberfeldy      2         2     2         0       0     2     1     2     2
## Aberlour       3         3     1         0       0     4     3     2     2
## AnCnoc         1         3     2         0       0     2     0     0     2
## Ardbeg         4         1     4         4       0     0     2     0     1
## Ardmore        2         2     2         0       0     1     1     1     2
## ArranIsleOf    2         3     1         1       0     1     1     1     0
##             Malty Fruity Floral
## Aberfeldy       2      2      2
## Aberlour        3      3      2
## AnCnoc          2      3      2
## Ardbeg          2      1      0
## Ardmore         3      1      1
## ArranIsleOf     1      1      2

Elbow method

# Elbow method: fit k-means for k = 1..10 (in that order, after fixing the
# seed) and record the summed within-cluster sum of squares of each fit.
set.seed(23)
wcss <- vapply(
  1:10,
  function(k) sum(kmeans(whisky, k)$withinss),
  numeric(1)
)
# Plot WCSS against k; the "elbow" suggests a reasonable number of clusters.
plot(seq_along(wcss), wcss, type = "b", main = "Elbow method",
     xlab = "Number of clusters (k)", ylab = "WCSS(k)")

#I chose k = 4

Apply the k-means algorithm with optimal k

# Final model: 4 centres, at most 300 iterations per run, keeping the best
# of 10 random starts.
k_means <- kmeans(x = whisky, centers = 4, iter.max = 300, nstart = 10)
print(k_means)

## K-means clustering with 4 clusters of sizes 37, 15, 6, 28
## 
## Cluster means:
##       Body Sweetness    Smoky  Medicinal    Tobacco     Honey    Spicy
## 1 1.432432  2.486486 1.054054 0.24324324 0.05405405 0.9729730 1.108108
## 2 1.866667  1.933333 2.066667 1.06666667 0.20000000 1.1333333 1.466667
## 3 3.666667  1.500000 3.666667 3.33333333 0.66666667 0.1666667 1.666667
## 4 2.678571  2.392857 1.428571 0.07142857 0.03571429 1.8928571 1.642857
##       Winey    Nutty    Malty   Fruity    Floral
## 1 0.4594595 1.162162 1.675676 1.972973 2.1081081
## 2 0.8666667 1.533333 1.800000 1.066667 1.1333333
## 3 0.5000000 1.166667 1.333333 1.166667 0.1666667
## 4 1.8214286 1.892857 2.071429 2.107143 1.7857143
## 
## Clustering vector:
##          Aberfeldy           Aberlour             AnCnoc             Ardbeg 
##                  4                  4                  1                  3 
##            Ardmore        ArranIsleOf       Auchentoshan          Auchroisk 
##                  2                  1                  1                  4 
##           Aultmore           Balblair          Balmenach           Belvenie 
##                  1                  2                  4                  4 
##           BenNevis           Benriach          Benrinnes          Benromach 
##                  4                  1                  4                  4 
##           Bladnoch         BlairAthol            Bowmore      Bruichladdich 
##                  1                  4                  2                  2 
##       Bunnahabhain           Caol Ila             Cardhu          Clynelish 
##                  1                  3                  1                  3 
##      Craigallechie       Craigganmore          Dailuaine            Dalmore 
##                  4                  1                  4                  4 
##         Dalwhinnie           Deanston           Dufftown           Edradour 
##                  1                  4                  1                  4 
## GlenDeveronMacduff          GlenElgin        GlenGarioch          GlenGrant 
##                  2                  1                  2                  1 
##          GlenKeith          GlenMoray            GlenOrd         GlenScotia 
##                  1                  1                  4                  2 
##           GlenSpey       Glenallachie        Glendronach         Glendullan 
##                  1                  1                  4                  4 
##        Glenfarclas        Glenfiddich          Glengoyne        Glenkinchie 
##                  4                  1                  1                  1 
##          Glenlivet         Glenlossie       Glenmorangie         Glenrothes 
##                  4                  1                  1                  4 
##         Glenturret      Highland Park          Inchgower       Isle of Jura 
##                  4                  2                  1                  2 
##          Knochando          Lagavulin           Laphroig           Linkwood 
##                  4                  3                  3                  1 
##        Loch Lomond           Longmorn           Macallan        Mannochmore 
##                  1                  4                  4                  1 
##         Miltonduff           Mortlach               Oban     OldFettercairn 
##                  1                  4                  2                  2 
##        OldPulteney       RoyalBrackla     RoyalLochnagar              Scapa 
##                  2                  1                  4                  4 
##           Speyburn           Speyside         Springbank         Strathisla 
##                  1                  1                  2                  4 
##         Strathmill           Talisker             Tamdhu         Tamnavulin 
##                  1                  3                  1                  1 
##          Teaninich          Tobermory            Tomatin          Tomintoul 
##                  1                  1                  2                  1 
##            Tormore       Tullibardine 
##                  2                  1 
## 
## Within cluster sum of squares by cluster:
## [1] 162.32432  78.93333  24.33333 137.60714
##  (between_SS / total_SS =  39.4 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

# List the components of the fitted object (length, class and mode of each).
summary(k_means)

##              Length Class  Mode   
## cluster      86     -none- numeric
## centers      48     -none- numeric
## totss         1     -none- numeric
## withinss      4     -none- numeric
## tot.withinss  1     -none- numeric
## betweenss     1     -none- numeric
## size          4     -none- numeric
## iter          1     -none- numeric
## ifault        1     -none- numeric

# Number of whiskies assigned to each cluster.
table(k_means$cluster)

## 
##  1  2  3  4 
## 37 15  6 28

# Cluster centres (mean score per flavour) as a data frame.
data.frame(k_means$centers)

##       Body Sweetness    Smoky  Medicinal    Tobacco     Honey    Spicy
## 1 1.432432  2.486486 1.054054 0.24324324 0.05405405 0.9729730 1.108108
## 2 1.866667  1.933333 2.066667 1.06666667 0.20000000 1.1333333 1.466667
## 3 3.666667  1.500000 3.666667 3.33333333 0.66666667 0.1666667 1.666667
## 4 2.678571  2.392857 1.428571 0.07142857 0.03571429 1.8928571 1.642857
##       Winey    Nutty    Malty   Fruity    Floral
## 1 0.4594595 1.162162 1.675676 1.972973 2.1081081
## 2 0.8666667 1.533333 1.800000 1.066667 1.1333333
## 3 0.5000000 1.166667 1.333333 1.166667 0.1666667
## 4 1.8214286 1.892857 2.071429 2.107143 1.7857143

# Recompute the per-cluster mean of every flavour directly from the data,
# grouping the rows by their assigned cluster.
aggregate(whisky, by = list(cluster = k_means$cluster), mean)

##   cluster     Body Sweetness    Smoky  Medicinal    Tobacco     Honey    Spicy
## 1       1 1.432432  2.486486 1.054054 0.24324324 0.05405405 0.9729730 1.108108
## 2       2 1.866667  1.933333 2.066667 1.06666667 0.20000000 1.1333333 1.466667
## 3       3 3.666667  1.500000 3.666667 3.33333333 0.66666667 0.1666667 1.666667
## 4       4 2.678571  2.392857 1.428571 0.07142857 0.03571429 1.8928571 1.642857
##       Winey    Nutty    Malty   Fruity    Floral
## 1 0.4594595 1.162162 1.675676 1.972973 2.1081081
## 2 0.8666667 1.533333 1.800000 1.066667 1.1333333
## 3 0.5000000 1.166667 1.333333 1.166667 0.1666667
## 4 1.8214286 1.892857 2.071429 2.107143 1.7857143

# Extract the cluster label of every whisky and append it to the flavour
# scores as the column `y_kmeans`.
y_kmeans <- k_means[["cluster"]]
whisky_cluster <- cbind(whisky, y_kmeans = y_kmeans)
head(whisky_cluster, n = 10)

##              Body Sweetness Smoky Medicinal Tobacco Honey Spicy Winey Nutty
## Aberfeldy       2         2     2         0       0     2     1     2     2
## Aberlour        3         3     1         0       0     4     3     2     2
## AnCnoc          1         3     2         0       0     2     0     0     2
## Ardbeg          4         1     4         4       0     0     2     0     1
## Ardmore         2         2     2         0       0     1     1     1     2
## ArranIsleOf     2         3     1         1       0     1     1     1     0
## Auchentoshan    0         2     0         0       0     1     1     0     2
## Auchroisk       2         3     1         0       0     2     1     2     2
## Aultmore        2         2     1         0       0     1     0     0     2
## Balblair        2         3     2         1       0     0     2     0     2
##              Malty Fruity Floral y_kmeans
## Aberfeldy        2      2      2        4
## Aberlour         3      3      2        4
## AnCnoc           2      3      2        1
## Ardbeg           2      1      0        3
## Ardmore          3      1      1        2
## ArranIsleOf      1      1      2        1
## Auchentoshan     2      3      3        1
## Auchroisk        2      2      1        4
## Aultmore         2      2      2        1
## Balblair         1      2      1        2

Viewing the clusters

library(factoextra)

## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

# Plot the clusters with factoextra, labelling each point with its
# distillery name.
fviz_cluster(
  k_means,
  data = whisky,
  geom = "text",
  labelsize = 7,
  main = "Clustering of Scotch Whisky"
)

#### Clustering model validation

library(fpc)
library(NbClust)
# Evaluate k = 2..12 for k-means on Euclidean distances with every index
# NbClust offers; the result is used to vote on the number of clusters.
nb <- NbClust(whisky, distance = "euclidean", min.nc = 2, max.nc = 12,
              method = "kmeans", index = "all")

## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot.
##

## *** : The D index is a graphical method of determining the number of clusters.
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure.
##
## *******************************************************************
## * Among all indices:
## * 11 proposed 2 as the best number of clusters
## * 7 proposed 3 as the best number of clusters
## * 2 proposed 5 as the best number of clusters
## * 2 proposed 7 as the best number of clusters
## * 1 proposed 9 as the best number of clusters
## * 1 proposed 12 as the best number of clusters
##
##                    ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is  2
##
##
## *******************************************************************

# Plot how many indices voted for each candidate number of clusters.
fviz_nbclust(nb) + theme_minimal()

## Among all indices:
## ===================
## * 2 proposed  0 as the best number of clusters
## * 11 proposed  2 as the best number of clusters
## * 7 proposed  3 as the best number of clusters
## * 2 proposed  5 as the best number of clusters
## * 2 proposed  7 as the best number of clusters
## * 1 proposed  9 as the best number of clusters
## * 1 proposed  12 as the best number of clusters
##
## Conclusion
## =========================
## * According to the majority rule, the best number of clusters is  2 .

Geospatial visualization

In this part I used the ‘ggmap’ library; the register_google command is not shown. I also looked up the latitude and longitude of every distillery in the dataset.

# Read the distillery coordinates (columns Lat and Lon) from "Zone.txt".
zone <- read.csv("Zone.txt")
head(zone)

##        Lat       Lon
## 1 56.62759 -3.847860
## 2 57.46793 -3.229942
## 3 57.65829 -3.082305
## 4 55.64025 -6.110231
## 5 57.35170 -2.747930
## 6 55.66252 -5.272552

# Attach the postcodes by position.
# NOTE(review): this assumes the rows of Zone.txt are in the same order as
# data_set — confirm, since nothing here checks it.
zone$Postcode <- data_set$Postcode
head(zone)

##        Lat       Lon Postcode
## 1 56.62759 -3.847860 PH15 2EB
## 2 57.46793 -3.229942 AB38 9PJ
## 3 57.65829 -3.082305  AB5 5LI
## 4 55.64025 -6.110231 PA42 7EB
## 5 57.35170 -2.747930 AB54 4NH
## 6 55.66252 -5.272552 KA27 8HJ

# Move Postcode to the front: Postcode, Lat, Lon.
zone <- zone[, c("Postcode", "Lat", "Lon")]
head(zone)

##   Postcode      Lat       Lon
## 1 PH15 2EB 56.62759 -3.847860
## 2 AB38 9PJ 57.46793 -3.229942
## 3  AB5 5LI 57.65829 -3.082305
## 4 PA42 7EB 55.64025 -6.110231
## 5 AB54 4NH 57.35170 -2.747930
## 6 KA27 8HJ 55.66252 -5.272552

# Append the Lat/Lon coordinates to the full dataset, column-wise by position.
data_set2 <- cbind(data_set, zone[, c("Lat", "Lon")])
head(data_set2)

##   RowID  Distillery Body Sweetness Smoky Medicinal Tobacco Honey Spicy Winey
## 1     1   Aberfeldy    2         2     2         0       0     2     1     2
## 2     2    Aberlour    3         3     1         0       0     4     3     2
## 3     3      AnCnoc    1         3     2         0       0     2     0     0
## 4     4      Ardbeg    4         1     4         4       0     0     2     0
## 5     5     Ardmore    2         2     2         0       0     1     1     1
## 6     6 ArranIsleOf    2         3     1         1       0     1     1     1
##   Nutty Malty Fruity Floral Postcode Latitude Longitude      Lat       Lon
## 1     2     2      2      2 PH15 2EB   286580    749680 56.62759 -3.847860
## 2     2     3      3      2 AB38 9PJ   326340    842570 57.46793 -3.229942
## 3     2     2      3      2  AB5 5LI   352960    839320 57.65829 -3.082305
## 4     1     2      1      0 PA42 7EB   141560    646220 55.64025 -6.110231
## 5     2     3      1      1 AB54 4NH   355350    829140 57.35170 -2.747930
## 6     0     1      1      2 KA27 8HJ   194050    649950 55.66252 -5.272552

# Keep only the name and coordinates of each distillery, then attach the
# k-means cluster label by position (y_kmeans follows the row order of the
# data the model was fitted on).
scotland_zone <- data_set2 %>%
  select(Distillery, Lat, Lon)
scotland_zone$cluster <- y_kmeans
head(scotland_zone)

##    Distillery      Lat       Lon cluster
## 1   Aberfeldy 56.62759 -3.847860       4
## 2    Aberlour 57.46793 -3.229942       4
## 3      AnCnoc 57.65829 -3.082305       1
## 4      Ardbeg 55.64025 -6.110231       3
## 5     Ardmore 57.35170 -2.747930       2
## 6 ArranIsleOf 55.66252 -5.272552       1

library(ggmap)

## Google's Terms of Service: https://cloud.google.com/maps-platform/terms/.

## Please cite ggmap if you use it! See citation("ggmap") for details.

library(maps)

## 
## Attaching package: 'maps'

## The following object is masked from 'package:purrr':
## 
##     map

Geospatial visualization

# Download the base map centred on longitude -4, latitude 57 at zoom level 6.
scotland <- get_map(
  location = c(lon = -4, lat = 57),
  zoom = 6
)

## Source : https://maps.googleapis.com/maps/api/staticmap?center=57,-4&zoom=6&size=640x640&scale=2&maptype=terrain&language=en-EN&key=xxx

# Plot every distillery on the base map, coloured by its k-means cluster.
# The cluster id is converted to character so it is treated as a discrete
# colour rather than a continuous scale.
ggmap(scotland) +
  geom_point(
    mapping = aes(x = Lon, y = Lat, color = as.character(cluster)),
    data = scotland_zone
  ) +
  scale_color_manual(values = c("red", "green", "blue", "purple")) +
  labs(title = "Scotch Whisky Cluster", x = "Longitude", y = "Latitude",
       color = "Cluster")

Analysis

# List every whisky assigned to cluster 4.
whisky_cluster %>% filter(y_kmeans == 4)

##                Body Sweetness Smoky Medicinal Tobacco Honey Spicy Winey Nutty
## Aberfeldy         2         2     2         0       0     2     1     2     2
## Aberlour          3         3     1         0       0     4     3     2     2
## Auchroisk         2         3     1         0       0     2     1     2     2
## Balmenach         4         3     2         0       0     2     1     3     3
## Belvenie          3         2     1         0       0     3     2     1     0
## BenNevis          4         2     2         0       0     2     2     0     2
## Benrinnes         3         2     2         0       0     3     1     1     2
## Benromach         2         2     2         0       0     2     2     1     2
## BlairAthol        2         2     2         0       0     1     2     2     2
## Craigallechie     2         2     2         0       1     2     2     1     2
## Dailuaine         4         2     2         0       0     1     2     2     2
## Dalmore           3         2     2         1       0     1     2     2     1
## Deanston          2         2     1         0       0     2     1     1     1
## Edradour          2         3     1         0       0     2     1     1     4
## GlenOrd           3         2     1         0       0     1     2     1     1
## Glendronach       4         2     2         0       0     2     1     4     2
## Glendullan        3         2     1         0       0     2     1     2     1
## Glenfarclas       2         4     1         0       0     1     2     3     2
## Glenlivet         2         3     1         0       0     2     2     2     1
## Glenrothes        2         3     1         0       0     1     1     2     1
## Glenturret        2         3     1         0       0     2     2     2     2
## Knochando         2         3     1         0       0     2     2     1     2
## Longmorn          3         2     1         0       0     1     1     1     3
## Macallan          4         3     1         0       0     2     1     4     2
## Mortlach          3         2     2         0       0     2     3     3     2
## RoyalLochnagar    3         2     2         0       0     2     2     2     2
## Scapa             2         2     1         1       0     2     1     1     2
## Strathisla        2         2     1         0       0     2     2     2     3
##                Malty Fruity Floral y_kmeans
## Aberfeldy          2      2      2        4
## Aberlour           3      3      2        4
## Auchroisk          2      2      1        4
## Balmenach          0      1      2        4
## Belvenie           2      2      2        4
## BenNevis           2      2      2        4
## Benrinnes          3      2      2        4
## Benromach          2      2      2        4
## BlairAthol         2      2      2        4
## Craigallechie      2      1      4        4
## Dailuaine          2      2      1        4
## Dalmore            2      3      1        4
## Deanston           3      2      1        4
## Edradour           2      2      2        4
## GlenOrd            2      2      2        4
## Glendronach        2      2      0        4
## Glendullan         2      3      2        4
## Glenfarclas        3      2      2        4
## Glenlivet          2      2      3        4
## Glenrothes         2      2      0        4
## Glenturret         2      1      2        4
## Knochando          1      2      2        4
## Longmorn           3      2      3        4
## Macallan           2      3      1        4
## Mortlach           1      2      2        4
## RoyalLochnagar     2      3      1        4
## Scapa              2      2      2        4
## Strathisla         3      3      2        4

# Whiskies assigned to cluster 1, without the cluster-label column.
c1 <- whisky_cluster %>%
  filter(y_kmeans == 1) %>%
  select(-y_kmeans)
head(c1)

##              Body Sweetness Smoky Medicinal Tobacco Honey Spicy Winey Nutty
## AnCnoc          1         3     2         0       0     2     0     0     2
## ArranIsleOf     2         3     1         1       0     1     1     1     0
## Auchentoshan    0         2     0         0       0     1     1     0     2
## Aultmore        2         2     1         0       0     1     0     0     2
## Benriach        2         2     1         0       0     2     2     0     0
## Bladnoch        1         2     1         0       0     0     1     1     0
##              Malty Fruity Floral
## AnCnoc           2      3      2
## ArranIsleOf      1      1      2
## Auchentoshan     2      3      3
## Aultmore         2      2      2
## Benriach         2      3      2
## Bladnoch         2      2      3

# Collapse cluster 1 to a named vector of per-flavour means. colMeans() is
# the direct tool for column means and avoids apply() coercing the data
# frame to a matrix first.
c1 <- colMeans(c1)

# Whiskies assigned to cluster 2, without the cluster-label column.
c2 <- whisky_cluster %>%
  filter(y_kmeans == 2) %>%
  select(-y_kmeans)
head(c2)

##                    Body Sweetness Smoky Medicinal Tobacco Honey Spicy Winey
## Ardmore               2         2     2         0       0     1     1     1
## Balblair              2         3     2         1       0     0     2     0
## Bowmore               2         2     3         1       0     2     2     1
## Bruichladdich         1         1     2         2       0     2     2     1
## GlenDeveronMacduff    2         3     1         1       1     1     1     2
## GlenGarioch           2         1     3         0       0     0     3     1
##                    Nutty Malty Fruity Floral
## Ardmore                2     3      1      1
## Balblair               2     1      2      1
## Bowmore                1     1      1      2
## Bruichladdich          2     2      2      2
## GlenDeveronMacduff     0     2      0      1
## GlenGarioch            0     2      2      2

# Collapse cluster 2 to a named vector of per-flavour means. colMeans() is
# the direct tool for column means and avoids apply() coercing the data
# frame to a matrix first.
c2 <- colMeans(c2)

# Whiskies assigned to cluster 3, without the cluster-label column.
c3 <- whisky_cluster %>%
  filter(y_kmeans == 3) %>%
  select(-y_kmeans)
head(c3)

##           Body Sweetness Smoky Medicinal Tobacco Honey Spicy Winey Nutty Malty
## Ardbeg       4         1     4         4       0     0     2     0     1     2
## Caol Ila     3         1     4         2       1     0     2     0     2     1
## Clynelish    3         2     3         3       1     0     2     0     1     1
## Lagavulin    4         1     4         4       1     0     1     2     1     1
## Laphroig     4         2     4         4       1     0     0     1     1     1
## Talisker     4         2     3         3       0     1     3     0     1     2
##           Fruity Floral
## Ardbeg         1      0
## Caol Ila       1      1
## Clynelish      2      0
## Lagavulin      1      0
## Laphroig       0      0
## Talisker       2      0

# Collapse cluster 3 to a named vector of per-flavour means. colMeans() is
# the direct tool for column means and avoids apply() coercing the data
# frame to a matrix first.
c3 <- colMeans(c3)

# Whiskies assigned to cluster 4, without the cluster-label column.
c4 <- whisky_cluster %>%
  filter(y_kmeans == 4) %>%
  select(-y_kmeans)
head(c4)

##           Body Sweetness Smoky Medicinal Tobacco Honey Spicy Winey Nutty Malty
## Aberfeldy    2         2     2         0       0     2     1     2     2     2
## Aberlour     3         3     1         0       0     4     3     2     2     3
## Auchroisk    2         3     1         0       0     2     1     2     2     2
## Balmenach    4         3     2         0       0     2     1     3     3     0
## Belvenie     3         2     1         0       0     3     2     1     0     2
## BenNevis     4         2     2         0       0     2     2     0     2     2
##           Fruity Floral
## Aberfeldy      2      2
## Aberlour       3      2
## Auchroisk      2      1
## Balmenach      1      2
## Belvenie       2      2
## BenNevis       2      2

# Collapse cluster 4 to a named vector of per-flavour means. colMeans() is
# the direct tool for column means and avoids apply() coercing the data
# frame to a matrix first.
c4 <- colMeans(c4)

# Side-by-side table of the per-flavour means: one row per flavour, one
# column per cluster.
analysis <- data.frame(c1 = c1, c2 = c2, c3 = c3, c4 = c4)
head(analysis)

##                   c1       c2        c3         c4
## Body      1.43243243 1.866667 3.6666667 2.67857143
## Sweetness 2.48648649 1.933333 1.5000000 2.39285714
## Smoky     1.05405405 2.066667 3.6666667 1.42857143
## Medicinal 0.24324324 1.066667 3.3333333 0.07142857
## Tobacco   0.05405405 0.200000 0.6666667 0.03571429
## Honey     0.97297297 1.133333 0.1666667 1.89285714

# Rank the flavours by their mean score in cluster 1, highest first.
analysis %>% arrange(desc(c1))

##                   c1        c2        c3         c4
## Sweetness 2.48648649 1.9333333 1.5000000 2.39285714
## Floral    2.10810811 1.1333333 0.1666667 1.78571429
## Fruity    1.97297297 1.0666667 1.1666667 2.10714286
## Malty     1.67567568 1.8000000 1.3333333 2.07142857
## Body      1.43243243 1.8666667 3.6666667 2.67857143
## Nutty     1.16216216 1.5333333 1.1666667 1.89285714
## Spicy     1.10810811 1.4666667 1.6666667 1.64285714
## Smoky     1.05405405 2.0666667 3.6666667 1.42857143
## Honey     0.97297297 1.1333333 0.1666667 1.89285714
## Winey     0.45945946 0.8666667 0.5000000 1.82142857
## Medicinal 0.24324324 1.0666667 3.3333333 0.07142857
## Tobacco   0.05405405 0.2000000 0.6666667 0.03571429

# Rank the flavours by their mean score in cluster 2, highest first.
analysis %>% arrange(desc(c2))

##                   c1        c2        c3         c4
## Smoky     1.05405405 2.0666667 3.6666667 1.42857143
## Sweetness 2.48648649 1.9333333 1.5000000 2.39285714
## Body      1.43243243 1.8666667 3.6666667 2.67857143
## Malty     1.67567568 1.8000000 1.3333333 2.07142857
## Nutty     1.16216216 1.5333333 1.1666667 1.89285714
## Spicy     1.10810811 1.4666667 1.6666667 1.64285714
## Honey     0.97297297 1.1333333 0.1666667 1.89285714
## Floral    2.10810811 1.1333333 0.1666667 1.78571429
## Medicinal 0.24324324 1.0666667 3.3333333 0.07142857
## Fruity    1.97297297 1.0666667 1.1666667 2.10714286
## Winey     0.45945946 0.8666667 0.5000000 1.82142857
## Tobacco   0.05405405 0.2000000 0.6666667 0.03571429

# Rank the flavours by their mean score in cluster 3, highest first.
analysis %>% arrange(desc(c3))

##                   c1        c2        c3         c4
## Body      1.43243243 1.8666667 3.6666667 2.67857143
## Smoky     1.05405405 2.0666667 3.6666667 1.42857143
## Medicinal 0.24324324 1.0666667 3.3333333 0.07142857
## Spicy     1.10810811 1.4666667 1.6666667 1.64285714
## Sweetness 2.48648649 1.9333333 1.5000000 2.39285714
## Malty     1.67567568 1.8000000 1.3333333 2.07142857
## Nutty     1.16216216 1.5333333 1.1666667 1.89285714
## Fruity    1.97297297 1.0666667 1.1666667 2.10714286
## Tobacco   0.05405405 0.2000000 0.6666667 0.03571429
## Winey     0.45945946 0.8666667 0.5000000 1.82142857
## Honey     0.97297297 1.1333333 0.1666667 1.89285714
## Floral    2.10810811 1.1333333 0.1666667 1.78571429

# Rank the flavours by their mean score in cluster 4, highest first.
analysis %>% arrange(desc(c4))

##                   c1        c2        c3         c4
## Body      1.43243243 1.8666667 3.6666667 2.67857143
## Sweetness 2.48648649 1.9333333 1.5000000 2.39285714
## Fruity    1.97297297 1.0666667 1.1666667 2.10714286
## Malty     1.67567568 1.8000000 1.3333333 2.07142857
## Honey     0.97297297 1.1333333 0.1666667 1.89285714
## Nutty     1.16216216 1.5333333 1.1666667 1.89285714
## Winey     0.45945946 0.8666667 0.5000000 1.82142857
## Floral    2.10810811 1.1333333 0.1666667 1.78571429
## Spicy     1.10810811 1.4666667 1.6666667 1.64285714
## Smoky     1.05405405 2.0666667 3.6666667 1.42857143
## Medicinal 0.24324324 1.0666667 3.3333333 0.07142857
## Tobacco   0.05405405 0.2000000 0.6666667 0.03571429

Conclusion

Cluster 1 companies produce whisky that focuses a lot on product image.
Cluster 2 companies produce better quality whisky focused on the classic without experimenting too much.
Cluster 3 companies produce quality whiskies that are characterized by strong flavors.
Cluster 4 companies produce quality whisky characterized by sweet flavors.

Scotch Whisky Dataset

Adrián

1/2/2021

Scotch Whisky

Dataset includes scores of 86 malt scotch whiskies

Data visualization

Elbow method

Apply the k-means algorithm with optimal k

Viewing the clusters

Geospatial visualization

Geospatial visualization

Analysis

Conclusion