Clusters Basic Guide

Segmentation, or clustering, is a set of techniques whose purpose is to form groups from a set of elements.

Step 1 Install and call libraries

library(ggplot2)
library(data.table)
library(cluster)
library(factoextra)
library(readr)
library(dplyr)

Step 2 Obtain data

# Read the wine measurements from the Excel workbook and display them.
# NOTE(review): the absolute path is specific to one machine; the workbook
# must exist at exactly this location for the chunk to run.
df <- readxl::read_excel(
  path = "/Users/valeriacantulobo/Downloads/wine.xlsx"
)
df
## # A tibble: 178 × 13
##    Alcohol Malic_Acid   Ash Ash_Alcanity Magnesium Total_Phenols Flavanoids
##      <dbl>      <dbl> <dbl>        <dbl>     <dbl>         <dbl>      <dbl>
##  1    14.2       1.71  2.43         15.6       127          2.8        3.06
##  2    13.2       1.78  2.14         11.2       100          2.65       2.76
##  3    13.2       2.36  2.67         18.6       101          2.8        3.24
##  4    14.4       1.95  2.5          16.8       113          3.85       3.49
##  5    13.2       2.59  2.87         21         118          2.8        2.69
##  6    14.2       1.76  2.45         15.2       112          3.27       3.39
##  7    14.4       1.87  2.45         14.6        96          2.5        2.52
##  8    14.1       2.15  2.61         17.6       121          2.6        2.51
##  9    14.8       1.64  2.17         14          97          2.8        2.98
## 10    13.9       1.35  2.27         16          98          2.98       3.15
## # ℹ 168 more rows
## # ℹ 6 more variables: Nonflavanoid_Phenols <dbl>, Proanthocyanins <dbl>,
## #   Color_Intensity <dbl>, Hue <dbl>, OD280 <dbl>, Proline <dbl>

Step 3 Understand the Data

# Per-column five-number summary plus mean, to compare the variables' ranges.
summary(object = df)
##     Alcohol        Malic_Acid         Ash         Ash_Alcanity  
##  Min.   :11.03   Min.   :0.740   Min.   :1.360   Min.   :10.60  
##  1st Qu.:12.36   1st Qu.:1.603   1st Qu.:2.210   1st Qu.:17.20  
##  Median :13.05   Median :1.865   Median :2.360   Median :19.50  
##  Mean   :13.00   Mean   :2.336   Mean   :2.367   Mean   :19.49  
##  3rd Qu.:13.68   3rd Qu.:3.083   3rd Qu.:2.558   3rd Qu.:21.50  
##  Max.   :14.83   Max.   :5.800   Max.   :3.230   Max.   :30.00  
##    Magnesium      Total_Phenols     Flavanoids    Nonflavanoid_Phenols
##  Min.   : 70.00   Min.   :0.980   Min.   :0.340   Min.   :0.1300      
##  1st Qu.: 88.00   1st Qu.:1.742   1st Qu.:1.205   1st Qu.:0.2700      
##  Median : 98.00   Median :2.355   Median :2.135   Median :0.3400      
##  Mean   : 99.74   Mean   :2.295   Mean   :2.029   Mean   :0.3619      
##  3rd Qu.:107.00   3rd Qu.:2.800   3rd Qu.:2.875   3rd Qu.:0.4375      
##  Max.   :162.00   Max.   :3.880   Max.   :5.080   Max.   :0.6600      
##  Proanthocyanins Color_Intensity       Hue             OD280      
##  Min.   :0.410   Min.   : 1.280   Min.   :0.4800   Min.   :1.270  
##  1st Qu.:1.250   1st Qu.: 3.220   1st Qu.:0.7825   1st Qu.:1.938  
##  Median :1.555   Median : 4.690   Median :0.9650   Median :2.780  
##  Mean   :1.591   Mean   : 5.058   Mean   :0.9574   Mean   :2.612  
##  3rd Qu.:1.950   3rd Qu.: 6.200   3rd Qu.:1.1200   3rd Qu.:3.170  
##  Max.   :3.580   Max.   :13.000   Max.   :1.7100   Max.   :4.000  
##     Proline      
##  Min.   : 278.0  
##  1st Qu.: 500.5  
##  Median : 673.5  
##  Mean   : 746.9  
##  3rd Qu.: 985.0  
##  Max.   :1680.0

Step 4 Scale the data

# Centre and scale every column of `df`, then turn the resulting matrix back
# into a data frame for the clustering steps.
df_scaled <- as.data.frame(x = scale(x = df, center = TRUE, scale = TRUE))

Step 5 Number of Clusters

# Initial number of clusters to request from k-means.
groups <- 4

Step 6 Generate Clusters

# Fix the RNG seed so the randomly chosen starting centres, and therefore the
# cluster labels and sizes, are the same on every run.
set.seed(123)
# Partition the scaled data into `groups` clusters. `nstart = 25` runs the
# algorithm from 25 random starts and keeps the best solution instead of
# relying on a single random initialisation.
segments <- kmeans(df_scaled, centers = groups, nstart = 25)
segments
## K-means clustering with 4 clusters of sizes 59, 40, 50, 29
## 
## Cluster means:
##      Alcohol  Malic_Acid        Ash Ash_Alcanity   Magnesium Total_Phenols
## 1  0.9028300 -0.29698179  0.2925626   -0.6927815  0.55567318    0.89175848
## 2 -1.0148325 -0.08176935  0.2496251    0.7224166 -0.51927755   -0.03732895
## 3  0.1766166  0.90395669  0.2153615    0.5494898 -0.07712756   -0.98731537
## 4 -0.7415344 -0.84155637 -1.3108371   -0.5343809 -0.28128407   -0.06051117
##   Flavanoids Nonflavanoid_Phenols Proanthocyanins Color_Intensity        Hue
## 1  0.9514773           -0.6073179       0.5966082       0.1931051  0.4694211
## 2  0.1901974            0.2723596       0.2622500      -0.9505255  0.3185023
## 3 -1.2236663            0.7114800      -0.7591372       0.9516989 -1.1867156
## 4 -0.0883360           -0.3667803      -0.2666562      -0.7226595  0.6517187
##        OD280    Proline
## 1  0.7749105  1.1750720
## 2  0.3736876 -0.8357003
## 3 -1.2857714 -0.3952058
## 4  0.1248741 -0.5565843
## 
## Clustering vector:
##   [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1
##  [38] 1 4 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 4 4 4 4 4 2 2 4 4 4 4 4 2 2 1
##  [75] 4 4 4 4 4 2 4 4 2 3 2 4 2 2 2 2 2 2 2 2 4 1 2 4 4 2 4 4 2 4 4 2 4 2 4 2 2
## [112] 2 2 2 2 2 4 2 3 2 2 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## [149] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## 
## Within cluster sum of squares by cluster:
## [1] 322.0013 332.1551 314.6524 211.4465
##  (between_SS / total_SS =  48.7 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

Step 7 Assign data to Clusters

# Attach each observation's cluster label to the original (unscaled)
# measurements as a new `cluster` column.
assignation <- cbind(df, cluster = segments[["cluster"]])
# assignation

Step 8 Graph Clusters

# Draw the cluster plot for the fitted k-means result, using the scaled data
# as the observations.
fviz_cluster(object = segments, data = df_scaled)

Step 9 Optimize number of clusters

# Compute the gap statistic to choose the number of clusters.
# Seeding the RNG first makes the result repeatable.
set.seed(123)
# Evaluate k-means for up to K.max = 12 clusters. The clustering function is
# passed by its full argument name (`FUNcluster`) rather than relying on
# partial matching of `FUN`; `nstart = 1` is forwarded to kmeans().
optimization <- clusGap(df_scaled, FUNcluster = kmeans, nstart = 1, K.max = 12)
plot(optimization, xlab = "Number of k clusters: ")

Step 10 Run it back with the right number of Clusters

# ---- Re-run the clustering with the optimised number of clusters ----
# Number of clusters, changed to the optimised value.
groups <- 3

# Fit k-means again on the scaled data with the new cluster count.
segments <- kmeans(x = df_scaled, centers = groups)
segments
## K-means clustering with 3 clusters of sizes 51, 62, 65
## 
## Cluster means:
##      Alcohol Malic_Acid        Ash Ash_Alcanity   Magnesium Total_Phenols
## 1  0.1644436  0.8690954  0.1863726    0.5228924 -0.07526047   -0.97657548
## 2  0.8328826 -0.3029551  0.3636801   -0.6084749  0.57596208    0.88274724
## 3 -0.9234669 -0.3929331 -0.4931257    0.1701220 -0.49032869   -0.07576891
##    Flavanoids Nonflavanoid_Phenols Proanthocyanins Color_Intensity        Hue
## 1 -1.21182921           0.72402116     -0.77751312       0.9388902 -1.1615122
## 2  0.97506900          -0.56050853      0.57865427       0.1705823  0.4726504
## 3  0.02075402          -0.03343924      0.05810161      -0.8993770  0.4605046
##        OD280    Proline
## 1 -1.2887761 -0.4059428
## 2  0.7770551  1.1220202
## 3  0.2700025 -0.7517257
## 
## Clustering vector:
##   [1] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
##  [38] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 3 1 3 3 3 3 3 3 3 3 3 3 3 2
##  [75] 3 3 3 3 3 3 3 3 3 1 3 3 3 3 3 3 3 3 3 3 3 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## [112] 3 3 3 3 3 3 3 1 3 3 2 3 3 3 3 3 3 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [149] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 
## Within cluster sum of squares by cluster:
## [1] 326.3537 385.6983 558.6971
##  (between_SS / total_SS =  44.8 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"
# Rebuild the labelled data set: original measurements plus the cluster label
# taken from the current `segments` fit.
assignation <- cbind(df, cluster = segments[["cluster"]])
# assignation

# Plot the current `segments` fit over the scaled observations.
fviz_cluster(object = segments, data = df_scaled)

Observations

Segmentation, or clustering, is a useful technique for companies that want to classify their customers and run more focused, specialized marketing campaigns.

Step 11 Cluster Classification Analysis

# Per-cluster profile: mean of every numeric column within each cluster.
# `na.rm = TRUE` is passed to mean() through an anonymous function, because
# supplying extra arguments via across()'s `...` is deprecated as of
# dplyr 1.1.0 and raises a warning.
# NOTE(review): grouping on `assignation$cluster` names the key column
# `assignation$cluster` and leaves `cluster` itself to be averaged with the
# other numeric columns; `group_by(cluster)` would avoid both, but it changes
# the shape of the result, so confirm before switching.
analisis <- assignation %>%
  group_by(assignation$cluster) %>%
  summarise(across(where(is.numeric), function(x) mean(x, na.rm = TRUE)))
## Warning: There was 1 warning in `summarise()`.
## ℹ In argument: `across(where(is.numeric), mean, na.rm = TRUE)`.
## ℹ In group 1: `assignation$cluster = 1`.
## Caused by warning:
## ! The `...` argument of `across()` is deprecated as of dplyr 1.1.0.
## Supply arguments directly to `.fns` through an anonymous function instead.
## 
##   # Previously
##   across(a:b, mean, na.rm = TRUE)
## 
##   # Now
##   across(a:b, \(x) mean(x, na.rm = TRUE))
# Show the per-cluster means computed above.
print(x = analisis)
## # A tibble: 3 × 15
##   `assignation$cluster` Alcohol Malic_Acid   Ash Ash_Alcanity Magnesium
##                   <int>   <dbl>      <dbl> <dbl>        <dbl>     <dbl>
## 1                     1    13.1       3.31  2.42         21.2      98.7
## 2                     2    13.7       2.00  2.47         17.5     108. 
## 3                     3    12.3       1.90  2.23         20.1      92.7
## # ℹ 9 more variables: Total_Phenols <dbl>, Flavanoids <dbl>,
## #   Nonflavanoid_Phenols <dbl>, Proanthocyanins <dbl>, Color_Intensity <dbl>,
## #   Hue <dbl>, OD280 <dbl>, Proline <dbl>, cluster <dbl>
LS0tCnRpdGxlOiAiQ2x1c3RlcnMgV2luZSIKYXV0aG9yOiAiVmFsZXJpYSBDYW50w7ogLSBBMDE1NzA3NTgiCmRhdGU6ICIyMDI0LTAyLTE5IgpvdXRwdXQ6IAogIGh0bWxfZG9jdW1lbnQ6IAogICAgdG9jOiBUUlVFCiAgICB0b2NfZmxvYXQ6IFRSVUUKICAgIGNvZGVfZG93bmxvYWQ6IFRSVUUKLS0tCgohW10oL1VzZXJzL3ZhbGVyaWFjYW50dWxvYm8vRG93bmxvYWRzL3Zpbm8uanBlZykKCiMgQ2x1c3RlcnMgQmFzaWMgR3VpZGUKU2VnbWVudGF0aW9uIG9yIGNsdXN0ZXJzIGlzIGEgc2V0IG9mIHRlY2huaXF1ZXMgd2hvc2UgcHVycG9zZSBpcyB0byBmb3JtIGdyb3VwcyBmcm9tIGEgc2V0IG9mIGVsZW1lbnRzLgoKCiMjIFN0ZXAgMSBJbnN0YWxsIGFuZCBjYWxsIGxpYnJhcmllcwpgYGB7ciBtZXNzYWdlPUZBTFNFLCB3YXJuaW5nPUZBTFNFfQpsaWJyYXJ5KGdncGxvdDIpCmxpYnJhcnkoZGF0YS50YWJsZSkKbGlicmFyeShjbHVzdGVyKQpsaWJyYXJ5KGZhY3RvZXh0cmEpCmxpYnJhcnkocmVhZHIpCmxpYnJhcnkoZHBseXIpCmBgYAoKIyMgU3RlcCAyIE9idGFpbiBkYXRhCmBgYHtyIG1lc3NhZ2U9RkFMU0UsIHdhcm5pbmc9RkFMU0V9CmRmIDwtIHJlYWR4bDo6cmVhZF9leGNlbCgiL1VzZXJzL3ZhbGVyaWFjYW50dWxvYm8vRG93bmxvYWRzL3dpbmUueGxzeCIpCmRmCmBgYAoKIyMgU3RlcCAzIFVuZGVyc3RhbmQgdGhlIERhdGEKYGBge3J9CnN1bW1hcnkoZGYpCmBgYAoKIyMgU3RlcCA0IFNjYWxlIHRoZSBkYXRhCmBgYHtyfQpkZl9zY2FsZWQgPC0gYXMuZGF0YS5mcmFtZShzY2FsZShkZikpCmBgYAoKCiMjIFN0ZXAgNSBOdW1iZXIgb2YgQ2x1c3RlcnMKYGBge3J9Cmdyb3VwcyA8LSA0CmBgYAoKIyMgU3RlcCA2IEdlbmVyYXRlIENsdXN0ZXJzCmBgYHtyfQpzZWdtZW50cyA8LSBrbWVhbnMoZGZfc2NhbGVkLGdyb3VwcykKc2VnbWVudHMKYGBgCgojIyBTdGVwIDcgQXNzaWduIGRhdGEgdG8gQ2x1c3RlcnMKYGBge3J9CmFzc2lnbmF0aW9uIDwtIGNiaW5kKGRmLCBjbHVzdGVyID0gc2VnbWVudHMkY2x1c3RlcikKI2Fzc2lnbmF0aW9uCmBgYAoKIyMgU3RlcCA4IEdyYXBoIENsdXN0ZXJzCmBgYHtyfQpmdml6X2NsdXN0ZXIoc2VnbWVudHMsIGRhdGEgPSBkZl9zY2FsZWQpCmBgYAoKIyMgU3RlcCA5IE9wdGltaXplIG51bWJlciBvZiBjbHVzdGVycwpgYGB7cn0Kc2V0LnNlZWQoMTIzKQpvcHRpbWl6YXRpb24gPSBjbHVzR2FwKGRmX3NjYWxlZCwgRlVOPWttZWFucywgbnN0YXJ0PTEsIEsubWF4ID0gMTIpCnBsb3Qob3B0aW1pemF0aW9uLCB4bGFiPSJOdW1iZXIgb2YgayBjbHVzdGVyczogIikKYGBgCgojIyBTdGVwIDEwIFJ1biBpdCBiYWNrIHdpdGggdGhlIHJpZ2h0IG51bWJlciBvZiBDbHVzdGVycwpgYGB7cn0KIyAtLS0tLS0tLS0tLS0gUlVOIElUIEJBQ0sgV0lUSCBPUFRJTUlaRUQgQ0xVU1RFUiBOVU1CRVIgLS0tLS0tLS0tLS0tLS0tLS0KIyBTdGVwIDMgTnVtYmVyIG9mIENsdXN0ZXJzCmdyb3VwcyA8LSAzICNDaGFuZ2Vk
IHRvIG9wdGltaXplZCBudW1iZXIgb2YgY2x1c3RlcnMKCiMgU3RlcCA0IEdlbmVyYXRlIENsdXN0ZXJzCnNlZ21lbnRzIDwtIGttZWFucyhkZl9zY2FsZWQsZ3JvdXBzKQpzZWdtZW50cwoKIyBTdGVwIDUgQXNzaWduIGRhdGEgdG8gQ2x1c3RlcnMKYXNzaWduYXRpb24gPC0gY2JpbmQoZGYsIGNsdXN0ZXIgPSBzZWdtZW50cyRjbHVzdGVyKQojYXNzaWduYXRpb24KCiMgU3RlcCA2IEdyYXBoIENsdXN0ZXJzCmZ2aXpfY2x1c3RlcihzZWdtZW50cywgZGF0YSA9IGRmX3NjYWxlZCkKYGBgCgojIyBPYnNlcnZhdGlvbnMKU2VnbWVudGF0aW9uIG9yIGNsdXN0ZXJzIGFyZSBhIHVzZWZ1bCBhbGdvcml0aG0gZm9yIGNvbXBhbmllcyB0aGF0IHdhbnQgdG8gY2xhc3NpZnkgdGhlaXIgY3VzdG9tZXJzIGFuZCBkaXJlY3QgbW9yZSBmb2N1c2VkIGFuZCBzcGVjaWFsaXplZCBtYXJrZXRpbmcgY2FtcGFpZ25zLgoKIyMgU3RlcCAxMSBDbHVzdGVyIENsYXNzaWZpY2F0aW9uIEFuYWxpc2lzCmBgYHtyfQojIEdyb3VwaW5nIGJ5ICdhc3NpZ25hdGlvbiRjbHVzdGVyJyBhbmQgY2FsY3VsYXRpbmcgbWVhbiBvZiBhbGwgbnVtZXJpYyB2YXJpYWJsZXMKYW5hbGlzaXMgPC0gYXNzaWduYXRpb24gJT4lCiAgZ3JvdXBfYnkoYXNzaWduYXRpb24kY2x1c3RlcikgJT4lCiAgc3VtbWFyaXNlKGFjcm9zcyh3aGVyZShpcy5udW1lcmljKSwgbWVhbiwgbmEucm0gPSBUUlVFKSkKCiMgRGlzcGxheSB0aGUgcmVzdWx0CnByaW50KGFuYWxpc2lzKQpgYGAKCg==