Segmentation, or clustering, is a set of techniques whose purpose is to form groups from a set of elements.
library(ggplot2)
library(data.table)
library(cluster)
library(factoextra)
library(readr)
library(dplyr)
# Load the wine data set from an Excel workbook.
# NOTE(review): the absolute path below is machine-specific; the script only
# runs on a machine where that file exists.
df <- readxl::read_excel(path = "/Users/valeriacantulobo/Downloads/wine.xlsx")
# Auto-print the tibble for a first look at the data.
df
## # A tibble: 178 × 13
## Alcohol Malic_Acid Ash Ash_Alcanity Magnesium Total_Phenols Flavanoids
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 14.2 1.71 2.43 15.6 127 2.8 3.06
## 2 13.2 1.78 2.14 11.2 100 2.65 2.76
## 3 13.2 2.36 2.67 18.6 101 2.8 3.24
## 4 14.4 1.95 2.5 16.8 113 3.85 3.49
## 5 13.2 2.59 2.87 21 118 2.8 2.69
## 6 14.2 1.76 2.45 15.2 112 3.27 3.39
## 7 14.4 1.87 2.45 14.6 96 2.5 2.52
## 8 14.1 2.15 2.61 17.6 121 2.6 2.51
## 9 14.8 1.64 2.17 14 97 2.8 2.98
## 10 13.9 1.35 2.27 16 98 2.98 3.15
## # ℹ 168 more rows
## # ℹ 6 more variables: Nonflavanoid_Phenols <dbl>, Proanthocyanins <dbl>,
## # Color_Intensity <dbl>, Hue <dbl>, OD280 <dbl>, Proline <dbl>
# Per-column descriptive statistics (min, quartiles, mean, max).
summary(object = df)
## Alcohol Malic_Acid Ash Ash_Alcanity
## Min. :11.03 Min. :0.740 Min. :1.360 Min. :10.60
## 1st Qu.:12.36 1st Qu.:1.603 1st Qu.:2.210 1st Qu.:17.20
## Median :13.05 Median :1.865 Median :2.360 Median :19.50
## Mean :13.00 Mean :2.336 Mean :2.367 Mean :19.49
## 3rd Qu.:13.68 3rd Qu.:3.083 3rd Qu.:2.558 3rd Qu.:21.50
## Max. :14.83 Max. :5.800 Max. :3.230 Max. :30.00
## Magnesium Total_Phenols Flavanoids Nonflavanoid_Phenols
## Min. : 70.00 Min. :0.980 Min. :0.340 Min. :0.1300
## 1st Qu.: 88.00 1st Qu.:1.742 1st Qu.:1.205 1st Qu.:0.2700
## Median : 98.00 Median :2.355 Median :2.135 Median :0.3400
## Mean : 99.74 Mean :2.295 Mean :2.029 Mean :0.3619
## 3rd Qu.:107.00 3rd Qu.:2.800 3rd Qu.:2.875 3rd Qu.:0.4375
## Max. :162.00 Max. :3.880 Max. :5.080 Max. :0.6600
## Proanthocyanins Color_Intensity Hue OD280
## Min. :0.410 Min. : 1.280 Min. :0.4800 Min. :1.270
## 1st Qu.:1.250 1st Qu.: 3.220 1st Qu.:0.7825 1st Qu.:1.938
## Median :1.555 Median : 4.690 Median :0.9650 Median :2.780
## Mean :1.591 Mean : 5.058 Mean :0.9574 Mean :2.612
## 3rd Qu.:1.950 3rd Qu.: 6.200 3rd Qu.:1.1200 3rd Qu.:3.170
## Max. :3.580 Max. :13.000 Max. :1.7100 Max. :4.000
## Proline
## Min. : 278.0
## 1st Qu.: 500.5
## Median : 673.5
## Mean : 746.9
## 3rd Qu.: 985.0
## Max. :1680.0
# Standardize every variable so that large-scale columns (e.g. Proline, which
# ranges into the thousands) do not dominate the distances used by k-means.
df_scaled <- as.data.frame(scale(df))
# Initial number of clusters.
groups <- 4
# k-means chooses its starting centers at random: fix the seed so the
# segmentation is reproducible, and keep the best of several random starts
# instead of trusting a single, possibly poor, local optimum.
set.seed(123)
segments <- kmeans(df_scaled, centers = groups, nstart = 25)
segments
## K-means clustering with 4 clusters of sizes 59, 40, 50, 29
##
## Cluster means:
## Alcohol Malic_Acid Ash Ash_Alcanity Magnesium Total_Phenols
## 1 0.9028300 -0.29698179 0.2925626 -0.6927815 0.55567318 0.89175848
## 2 -1.0148325 -0.08176935 0.2496251 0.7224166 -0.51927755 -0.03732895
## 3 0.1766166 0.90395669 0.2153615 0.5494898 -0.07712756 -0.98731537
## 4 -0.7415344 -0.84155637 -1.3108371 -0.5343809 -0.28128407 -0.06051117
## Flavanoids Nonflavanoid_Phenols Proanthocyanins Color_Intensity Hue
## 1 0.9514773 -0.6073179 0.5966082 0.1931051 0.4694211
## 2 0.1901974 0.2723596 0.2622500 -0.9505255 0.3185023
## 3 -1.2236663 0.7114800 -0.7591372 0.9516989 -1.1867156
## 4 -0.0883360 -0.3667803 -0.2666562 -0.7226595 0.6517187
## OD280 Proline
## 1 0.7749105 1.1750720
## 2 0.3736876 -0.8357003
## 3 -1.2857714 -0.3952058
## 4 0.1248741 -0.5565843
##
## Clustering vector:
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1
## [38] 1 4 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 4 4 4 4 4 2 2 4 4 4 4 4 2 2 1
## [75] 4 4 4 4 4 2 4 4 2 3 2 4 2 2 2 2 2 2 2 2 4 1 2 4 4 2 4 4 2 4 4 2 4 2 4 2 2
## [112] 2 2 2 2 2 4 2 3 2 2 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## [149] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
##
## Within cluster sum of squares by cluster:
## [1] 322.0013 332.1551 314.6524 211.4465
## (between_SS / total_SS = 48.7 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
# Attach each observation's cluster label to the original (unscaled) data.
assignation <- cbind(df, cluster = segments$cluster)
#assignation
# Visualize the clusters found on the scaled data.
fviz_cluster(segments, data = df_scaled)
# Gap statistic for k = 1..12 to help choose the number of clusters.
# Seeded so the result is reproducible.
set.seed(123)
# `FUNcluster` is spelled out in full rather than relying on partial argument
# matching of `FUN`; `nstart` is forwarded to kmeans().
optimization <- clusGap(df_scaled, FUNcluster = kmeans, nstart = 1, K.max = 12)
plot(optimization, xlab = "Number of k clusters: ")
# ------------ RUN IT BACK WITH OPTIMIZED CLUSTER NUMBER -----------------
# Step 3 Number of Clusters
groups <- 3 # Changed to optimized number of clusters
# Step 4 Generate Clusters
# Seed explicitly rather than inheriting whatever RNG state earlier calls left
# behind, and keep the best of several random starts so the reported solution
# is not a poor local optimum.
set.seed(123)
segments <- kmeans(df_scaled, centers = groups, nstart = 25)
segments
## K-means clustering with 3 clusters of sizes 51, 62, 65
##
## Cluster means:
## Alcohol Malic_Acid Ash Ash_Alcanity Magnesium Total_Phenols
## 1 0.1644436 0.8690954 0.1863726 0.5228924 -0.07526047 -0.97657548
## 2 0.8328826 -0.3029551 0.3636801 -0.6084749 0.57596208 0.88274724
## 3 -0.9234669 -0.3929331 -0.4931257 0.1701220 -0.49032869 -0.07576891
## Flavanoids Nonflavanoid_Phenols Proanthocyanins Color_Intensity Hue
## 1 -1.21182921 0.72402116 -0.77751312 0.9388902 -1.1615122
## 2 0.97506900 -0.56050853 0.57865427 0.1705823 0.4726504
## 3 0.02075402 -0.03343924 0.05810161 -0.8993770 0.4605046
## OD280 Proline
## 1 -1.2887761 -0.4059428
## 2 0.7770551 1.1220202
## 3 0.2700025 -0.7517257
##
## Clustering vector:
## [1] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [38] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 3 1 3 3 3 3 3 3 3 3 3 3 3 2
## [75] 3 3 3 3 3 3 3 3 3 1 3 3 3 3 3 3 3 3 3 3 3 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## [112] 3 3 3 3 3 3 3 1 3 3 2 3 3 3 3 3 3 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [149] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##
## Within cluster sum of squares by cluster:
## [1] 326.3537 385.6983 558.6971
## (between_SS / total_SS = 44.8 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
# Step 5 Assign data to Clusters
# Bind every row's cluster label onto the original (unscaled) data.
assignation <- cbind(df, cluster = segments[["cluster"]])
#assignation
# Step 6 Graph Clusters
fviz_cluster(object = segments, data = df_scaled)
Segmentation, or clustering, is a useful technique for companies that want to classify their customers and run more focused, specialized marketing campaigns.
# Profile each cluster: mean of every numeric variable, per cluster.
# Grouping by the bare column name `cluster` makes it the grouping key, so the
# result carries one clean `cluster` column instead of a synthetic
# `assignation$cluster` key plus a redundant averaged `cluster` column.
analisis <- assignation %>%
  group_by(cluster) %>%
  summarise(
    # Pass na.rm through an anonymous function: supplying extra arguments via
    # the `...` of across() is deprecated as of dplyr 1.1.0 and emits a warning.
    across(where(is.numeric), function(x) mean(x, na.rm = TRUE)),
    .groups = "drop"
  )
## Warning: There was 1 warning in `summarise()`.
## ℹ In argument: `across(where(is.numeric), mean, na.rm = TRUE)`.
## ℹ In group 1: `assignation$cluster = 1`.
## Caused by warning:
## ! The `...` argument of `across()` is deprecated as of dplyr 1.1.0.
## Supply arguments directly to `.fns` through an anonymous function instead.
##
## # Previously
## across(a:b, mean, na.rm = TRUE)
##
## # Now
## across(a:b, \(x) mean(x, na.rm = TRUE))
# Show the table of per-cluster means.
print(x = analisis)
## # A tibble: 3 × 15
## `assignation$cluster` Alcohol Malic_Acid Ash Ash_Alcanity Magnesium
## <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 13.1 3.31 2.42 21.2 98.7
## 2 2 13.7 2.00 2.47 17.5 108.
## 3 3 12.3 1.90 2.23 20.1 92.7
## # ℹ 9 more variables: Total_Phenols <dbl>, Flavanoids <dbl>,
## # Nonflavanoid_Phenols <dbl>, Proanthocyanins <dbl>, Color_Intensity <dbl>,
## # Hue <dbl>, OD280 <dbl>, Proline <dbl>, cluster <dbl>