Teoría

Estos datos son el resultado de un análisis de vinos cultivados en la misma región de italia pero derivados de tres cultivares diferentes.

Instalar paquetes y llamar librerias

#install.packages("cluster")
library(cluster)
#install.packages("ggplot2")
library(ggplot2)
#install.packages("data.table")
library(data.table)
#install.packages("factoextra")
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ lubridate 1.9.4     ✔ tibble    3.3.0
## ✔ purrr     1.1.0     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::between()     masks data.table::between()
## ✖ dplyr::filter()      masks stats::filter()
## ✖ dplyr::first()       masks data.table::first()
## ✖ lubridate::hour()    masks data.table::hour()
## ✖ lubridate::isoweek() masks data.table::isoweek()
## ✖ dplyr::lag()         masks stats::lag()
## ✖ dplyr::last()        masks data.table::last()
## ✖ lubridate::mday()    masks data.table::mday()
## ✖ lubridate::minute()  masks data.table::minute()
## ✖ lubridate::month()   masks data.table::month()
## ✖ lubridate::quarter() masks data.table::quarter()
## ✖ lubridate::second()  masks data.table::second()
## ✖ purrr::transpose()   masks data.table::transpose()
## ✖ lubridate::wday()    masks data.table::wday()
## ✖ lubridate::week()    masks data.table::week()
## ✖ lubridate::yday()    masks data.table::yday()
## ✖ lubridate::year()    masks data.table::year()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

Paso 2. Obtener los datos

df1 <-read_csv("C:/Users/anama/Downloads/wine.csv")
## Rows: 178 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (13): Alcohol, Malic_Acid, Ash, Ash_Alcanity, Magnesium, Total_Phenols, ...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Paso 3. Entender los datos

summary(df1)
##     Alcohol        Malic_Acid         Ash         Ash_Alcanity  
##  Min.   :11.03   Min.   :0.740   Min.   :1.360   Min.   :10.60  
##  1st Qu.:12.36   1st Qu.:1.603   1st Qu.:2.210   1st Qu.:17.20  
##  Median :13.05   Median :1.865   Median :2.360   Median :19.50  
##  Mean   :13.00   Mean   :2.336   Mean   :2.367   Mean   :19.49  
##  3rd Qu.:13.68   3rd Qu.:3.083   3rd Qu.:2.558   3rd Qu.:21.50  
##  Max.   :14.83   Max.   :5.800   Max.   :3.230   Max.   :30.00  
##    Magnesium      Total_Phenols     Flavanoids    Nonflavanoid_Phenols
##  Min.   : 70.00   Min.   :0.980   Min.   :0.340   Min.   :0.1300      
##  1st Qu.: 88.00   1st Qu.:1.742   1st Qu.:1.205   1st Qu.:0.2700      
##  Median : 98.00   Median :2.355   Median :2.135   Median :0.3400      
##  Mean   : 99.74   Mean   :2.295   Mean   :2.029   Mean   :0.3619      
##  3rd Qu.:107.00   3rd Qu.:2.800   3rd Qu.:2.875   3rd Qu.:0.4375      
##  Max.   :162.00   Max.   :3.880   Max.   :5.080   Max.   :0.6600      
##  Proanthocyanins Color_Intensity       Hue             OD280      
##  Min.   :0.410   Min.   : 1.280   Min.   :0.4800   Min.   :1.270  
##  1st Qu.:1.250   1st Qu.: 3.220   1st Qu.:0.7825   1st Qu.:1.938  
##  Median :1.555   Median : 4.690   Median :0.9650   Median :2.780  
##  Mean   :1.591   Mean   : 5.058   Mean   :0.9574   Mean   :2.612  
##  3rd Qu.:1.950   3rd Qu.: 6.200   3rd Qu.:1.1200   3rd Qu.:3.170  
##  Max.   :3.580   Max.   :13.000   Max.   :1.7100   Max.   :4.000  
##     Proline      
##  Min.   : 278.0  
##  1st Qu.: 500.5  
##  Median : 673.5  
##  Mean   : 746.9  
##  3rd Qu.: 985.0  
##  Max.   :1680.0
str(df1)
## spc_tbl_ [178 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ Alcohol             : num [1:178] 14.2 13.2 13.2 14.4 13.2 ...
##  $ Malic_Acid          : num [1:178] 1.71 1.78 2.36 1.95 2.59 1.76 1.87 2.15 1.64 1.35 ...
##  $ Ash                 : num [1:178] 2.43 2.14 2.67 2.5 2.87 2.45 2.45 2.61 2.17 2.27 ...
##  $ Ash_Alcanity        : num [1:178] 15.6 11.2 18.6 16.8 21 15.2 14.6 17.6 14 16 ...
##  $ Magnesium           : num [1:178] 127 100 101 113 118 112 96 121 97 98 ...
##  $ Total_Phenols       : num [1:178] 2.8 2.65 2.8 3.85 2.8 3.27 2.5 2.6 2.8 2.98 ...
##  $ Flavanoids          : num [1:178] 3.06 2.76 3.24 3.49 2.69 3.39 2.52 2.51 2.98 3.15 ...
##  $ Nonflavanoid_Phenols: num [1:178] 0.28 0.26 0.3 0.24 0.39 0.34 0.3 0.31 0.29 0.22 ...
##  $ Proanthocyanins     : num [1:178] 2.29 1.28 2.81 2.18 1.82 1.97 1.98 1.25 1.98 1.85 ...
##  $ Color_Intensity     : num [1:178] 5.64 4.38 5.68 7.8 4.32 6.75 5.25 5.05 5.2 7.22 ...
##  $ Hue                 : num [1:178] 1.04 1.05 1.03 0.86 1.04 1.05 1.02 1.06 1.08 1.01 ...
##  $ OD280               : num [1:178] 3.92 3.4 3.17 3.45 2.93 2.85 3.58 3.58 2.85 3.55 ...
##  $ Proline             : num [1:178] 1065 1050 1185 1480 735 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   Alcohol = col_double(),
##   ..   Malic_Acid = col_double(),
##   ..   Ash = col_double(),
##   ..   Ash_Alcanity = col_double(),
##   ..   Magnesium = col_double(),
##   ..   Total_Phenols = col_double(),
##   ..   Flavanoids = col_double(),
##   ..   Nonflavanoid_Phenols = col_double(),
##   ..   Proanthocyanins = col_double(),
##   ..   Color_Intensity = col_double(),
##   ..   Hue = col_double(),
##   ..   OD280 = col_double(),
##   ..   Proline = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>

Paso 4.Escalar los datos

datos_escalados <-scale(df1) 

Paso 5.Determinar el número de grupos

#Siempre es un valor inicial "cualquiera", luego se optimiza
grupos1<-3

Paso 6. Generar los grupos

set.seed(123)
clusters1 <- kmeans(datos_escalados,grupos1)
clusters1
## K-means clustering with 3 clusters of sizes 51, 62, 65
## 
## Cluster means:
##      Alcohol Malic_Acid        Ash Ash_Alcanity   Magnesium Total_Phenols
## 1  0.1644436  0.8690954  0.1863726    0.5228924 -0.07526047   -0.97657548
## 2  0.8328826 -0.3029551  0.3636801   -0.6084749  0.57596208    0.88274724
## 3 -0.9234669 -0.3929331 -0.4931257    0.1701220 -0.49032869   -0.07576891
##    Flavanoids Nonflavanoid_Phenols Proanthocyanins Color_Intensity        Hue
## 1 -1.21182921           0.72402116     -0.77751312       0.9388902 -1.1615122
## 2  0.97506900          -0.56050853      0.57865427       0.1705823  0.4726504
## 3  0.02075402          -0.03343924      0.05810161      -0.8993770  0.4605046
##        OD280    Proline
## 1 -1.2887761 -0.4059428
## 2  0.7770551  1.1220202
## 3  0.2700025 -0.7517257
## 
## Clustering vector:
##   [1] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
##  [38] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 3 1 3 3 3 3 3 3 3 3 3 3 3 2
##  [75] 3 3 3 3 3 3 3 3 3 1 3 3 3 3 3 3 3 3 3 3 3 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## [112] 3 3 3 3 3 3 3 1 3 3 2 3 3 3 3 3 3 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [149] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 
## Within cluster sum of squares by cluster:
## [1] 326.3537 385.6983 558.6971
##  (between_SS / total_SS =  44.8 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

Paso 7. Optimizar el número de grupos

set.seed(123)
optimizacion <-clusGap(datos_escalados,FUN=kmeans, nstart=1, K.max=10)
plot(optimizacion,xlab="Número de clusters K")

# Paso 8. Graficar los grupos

fviz_cluster(clusters1, data=datos_escalados)

# Paso 9. Agregar clusters a la base de datos

escalados_clusters <- cbind(datos_escalados, cluster = clusters1$cluster)
head(escalados_clusters)
##        Alcohol  Malic_Acid        Ash Ash_Alcanity  Magnesium Total_Phenols
## [1,] 1.5143408 -0.56066822  0.2313998   -1.1663032 1.90852151     0.8067217
## [2,] 0.2455968 -0.49800856 -0.8256672   -2.4838405 0.01809398     0.5670481
## [3,] 0.1963252  0.02117152  1.1062139   -0.2679823 0.08810981     0.8067217
## [4,] 1.6867914 -0.34583508  0.4865539   -0.8069748 0.92829983     2.4844372
## [5,] 0.2948684  0.22705328  1.8352256    0.4506745 1.27837900     0.8067217
## [6,] 1.4773871 -0.51591132  0.3043010   -1.2860793 0.85828399     1.5576991
##      Flavanoids Nonflavanoid_Phenols Proanthocyanins Color_Intensity        Hue
## [1,]  1.0319081           -0.6577078       1.2214385       0.2510088  0.3611585
## [2,]  0.7315653           -0.8184106      -0.5431887      -0.2924962  0.4049085
## [3,]  1.2121137           -0.4970050       2.1299594       0.2682629  0.3174085
## [4,]  1.4623994           -0.9791134       1.0292513       1.1827317 -0.4263410
## [5,]  0.6614853            0.2261576       0.4002753      -0.3183774  0.3611585
## [6,]  1.3622851           -0.1755994       0.6623487       0.7298108  0.4049085
##          OD280     Proline cluster
## [1,] 1.8427215  1.01015939       2
## [2,] 1.1103172  0.96252635       2
## [3,] 0.7863692  1.39122370       2
## [4,] 1.1807407  2.32800680       2
## [5,] 0.4483365 -0.03776747       2
## [6,] 0.3356589  2.23274072       2

Conclusiones

La técnica de clustering permite identificar patrones o grupos naturales en los datos sin necesidad de etiquetas previsas.

LS0tDQp0aXRsZTogIlZpbm9zIg0KYXV0aG9yOiAiQW5hIEdvbnrDoWxleiBBMDA4MzU1MTIiDQpkYXRlOiAiMjAyNS0wOC0xOCINCm91dHB1dDogDQogIGh0bWxfZG9jdW1lbnQ6DQogICAgdG9jOiBUUlVFDQogICAgdG9jX2Zsb2F0OiBUUlVFDQogICAgY29kZV9kb3dubG9hZDogVFJVRQ0KICAgIHRoZW1lOiB5ZXRpDQotLS0NCg0KIVtdKGh0dHBzOi8vaS5waW5pbWcuY29tL29yaWdpbmFscy83MS8wYy8wOS83MTBjMDkzZjNkY2Q0N2YyZGY2ZjEyM2U4MTAyOGQwZC5naWYpDQoNCg0KDQoNCg0KIyA8c3BhbiBzdHlsZT0iY29sb3I6Ymx1ZTsiPlRlb3LDrWEgPC9zcGFuPg0KRXN0b3MgZGF0b3Mgc29uIGVsIHJlc3VsdGFkbyBkZSB1biBhbsOhbGlzaXMgZGUgdmlub3MgY3VsdGl2YWRvcyBlbiBsYSBtaXNtYSByZWdpw7NuIGRlIGl0YWxpYSBwZXJvIGRlcml2YWRvcyBkZSB0cmVzIGN1bHRpdmFyZXMgZGlmZXJlbnRlcy4NCg0KIyA8c3BhbiBzdHlsZT0iY29sb3I6Ymx1ZTsiPkluc3RhbGFyIHBhcXVldGVzIHkgbGxhbWFyIGxpYnJlcmlhcyA8L3NwYW4+DQpgYGB7cn0NCiNpbnN0YWxsLnBhY2thZ2VzKCJjbHVzdGVyIikNCmxpYnJhcnkoY2x1c3RlcikNCiNpbnN0YWxsLnBhY2thZ2VzKCJnZ3Bsb3QyIikNCmxpYnJhcnkoZ2dwbG90MikNCiNpbnN0YWxsLnBhY2thZ2VzKCJkYXRhLnRhYmxlIikNCmxpYnJhcnkoZGF0YS50YWJsZSkNCiNpbnN0YWxsLnBhY2thZ2VzKCJmYWN0b2V4dHJhIikNCmxpYnJhcnkoZmFjdG9leHRyYSkNCmxpYnJhcnkodGlkeXZlcnNlKQ0KYGBgDQojIDxzcGFuIHN0eWxlPSJjb2xvcjpibHVlOyI+UGFzbyAyLiBPYnRlbmVyIGxvcyBkYXRvcyA8L3NwYW4+DQpgYGB7cn0NCmRmMSA8LXJlYWRfY3N2KCJDOi9Vc2Vycy9hbmFtYS9Eb3dubG9hZHMvd2luZS5jc3YiKQ0KYGBgDQojIDxzcGFuIHN0eWxlPSJjb2xvcjpibHVlOyI+UGFzbyAzLiBFbnRlbmRlciBsb3MgZGF0b3MgPC9zcGFuPg0KYGBge3J9DQpzdW1tYXJ5KGRmMSkNCnN0cihkZjEpDQpgYGANCiMgPHNwYW4gc3R5bGU9ImNvbG9yOmJsdWU7Ij5QYXNvIDQuRXNjYWxhciBsb3MgZGF0b3MgPC9zcGFuPg0KYGBge3J9DQpkYXRvc19lc2NhbGFkb3MgPC1zY2FsZShkZjEpIA0KYGBgDQojIDxzcGFuIHN0eWxlPSJjb2xvcjpibHVlOyI+UGFzbyA1LkRldGVybWluYXIgZWwgbsO6bWVybyBkZSBncnVwb3MgPC9zcGFuPg0KYGBge3J9DQojU2llbXByZSBlcyB1biB2YWxvciBpbmljaWFsICJjdWFscXVpZXJhIiwgbHVlZ28gc2Ugb3B0aW1pemENCmdydXBvczE8LTMNCmBgYA0KIyA8c3BhbiBzdHlsZT0iY29sb3I6Ymx1ZTsiPlBhc28gNi4gR2VuZXJhciBsb3MgZ3J1cG9zIDwvc3Bhbj4NCmBgYHtyfQ0Kc2V0LnNlZWQoMTIzKQ0KY2x1c3RlcnMxIDwtIGttZWFucyhkYXRvc19lc2NhbGFkb3MsZ3J1cG9zMSkNCmNsdXN0ZXJzMQ0KYGBgDQojIDxzcGFuIHN0eWxlPSJjb2xvcjpibHVlOyI+UGFzbyA3LiBPcHRpbWl6YXIgZWwgbsO6bWVybyBkZSBncnVwb3MgPC9zcGFuPg0KYGBge3J9DQpzZXQuc2VlZCgxMjMpDQpvcHRpbWl6YWNpb24gPC1jbHVzR2FwKGRhdG9zX2VzY2FsYWRvcyxGVU49a21lYW5zLCBuc3RhcnQ9MSwgSy5tYXg9MTApDQpwbG90KG9wdGltaXphY2lvbix4bGFiPSJOw7ptZXJvIGRlIGNsdXN0ZXJzIEsiKQ0KYGBgDQojIDxzcGFuIHN0eWxlPSJjb2xvcjpibHVlOyI+UGFzbyA4LiBHcmFmaWNhciBsb3MgZ3J1cG9zIDwvc3Bhbj4NCmBgYHtyfQ0KZnZpel9jbHVzdGVyKGNsdXN0ZXJzMSwgZGF0YT1kYXRvc19lc2NhbGFkb3MpDQpgYGANCiMgPHNwYW4gc3R5bGU9ImNvbG9yOmJsdWU7Ij5QYXNvIDkuIEFncmVnYXIgY2x1c3RlcnMgYSBsYSBiYXNlIGRlIGRhdG9zIDwvc3Bhbj4NCg0KYGBge3J9DQplc2NhbGFkb3NfY2x1c3RlcnMgPC0gY2JpbmQoZGF0b3NfZXNjYWxhZG9zLCBjbHVzdGVyID0gY2x1c3RlcnMxJGNsdXN0ZXIpDQpoZWFkKGVzY2FsYWRvc19jbHVzdGVycykNCmBgYA0KIyA8c3BhbiBzdHlsZT0iY29sb3I6Ymx1ZTsiPiBDb25jbHVzaW9uZXMgPC9zcGFuPg0KTGEgdMOpY25pY2EgZGUgKmNsdXN0ZXJpbmcqIHBlcm1pdGUgaWRlbnRpZmljYXIgcGF0cm9uZXMgbyBncnVwb3MgbmF0dXJhbGVzIGVuIGxvcyBkYXRvcyBzaW4gbmVjZXNpZGFkIGRlIGV0aXF1ZXRhcyBwcmV2aXNhcy4NCg0KDQo=