Contexto

Estos datos son el resultado de un análisis químico de vinos cultivados en la misma región de Italia, pero derivados de 3 cultivares diferentes.

El análisis determinó las cantidades de 13 componentes que se encuentran en cada uno de los 3 cultivares.

Instalar paquetes y llamar librerías

#install.packages("cluster")  #Análisis de Agrupamiento
library(cluster)
#install.packages("ggplot2") #Graficar
library(ggplot2)
#install.packages("data.table") #Manejo de muchos datos 
library(data.table)
#install.packages("factoextra") #Gráfica optimizazción de número de clusters
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

Paso2. Obtener los datos

df <- read.csv("C:\\Users\\Max\\Desktop\\UNI TEC\\7mo semestre\\modulo 2\\R\\segunda act wines\\wine.csv")

P.3 Entender los datos

summary(df)
##     Alcohol        Malic_Acid         Ash         Ash_Alcanity  
##  Min.   :11.03   Min.   :0.740   Min.   :1.360   Min.   :10.60  
##  1st Qu.:12.36   1st Qu.:1.603   1st Qu.:2.210   1st Qu.:17.20  
##  Median :13.05   Median :1.865   Median :2.360   Median :19.50  
##  Mean   :13.00   Mean   :2.336   Mean   :2.367   Mean   :19.49  
##  3rd Qu.:13.68   3rd Qu.:3.083   3rd Qu.:2.558   3rd Qu.:21.50  
##  Max.   :14.83   Max.   :5.800   Max.   :3.230   Max.   :30.00  
##    Magnesium      Total_Phenols     Flavanoids    Nonflavanoid_Phenols
##  Min.   : 70.00   Min.   :0.980   Min.   :0.340   Min.   :0.1300      
##  1st Qu.: 88.00   1st Qu.:1.742   1st Qu.:1.205   1st Qu.:0.2700      
##  Median : 98.00   Median :2.355   Median :2.135   Median :0.3400      
##  Mean   : 99.74   Mean   :2.295   Mean   :2.029   Mean   :0.3619      
##  3rd Qu.:107.00   3rd Qu.:2.800   3rd Qu.:2.875   3rd Qu.:0.4375      
##  Max.   :162.00   Max.   :3.880   Max.   :5.080   Max.   :0.6600      
##  Proanthocyanins Color_Intensity       Hue             OD280      
##  Min.   :0.410   Min.   : 1.280   Min.   :0.4800   Min.   :1.270  
##  1st Qu.:1.250   1st Qu.: 3.220   1st Qu.:0.7825   1st Qu.:1.938  
##  Median :1.555   Median : 4.690   Median :0.9650   Median :2.780  
##  Mean   :1.591   Mean   : 5.058   Mean   :0.9574   Mean   :2.612  
##  3rd Qu.:1.950   3rd Qu.: 6.200   3rd Qu.:1.1200   3rd Qu.:3.170  
##  Max.   :3.580   Max.   :13.000   Max.   :1.7100   Max.   :4.000  
##     Proline      
##  Min.   : 278.0  
##  1st Qu.: 500.5  
##  Median : 673.5  
##  Mean   : 746.9  
##  3rd Qu.: 985.0  
##  Max.   :1680.0
str(df)
## 'data.frame':    178 obs. of  13 variables:
##  $ Alcohol             : num  14.2 13.2 13.2 14.4 13.2 ...
##  $ Malic_Acid          : num  1.71 1.78 2.36 1.95 2.59 1.76 1.87 2.15 1.64 1.35 ...
##  $ Ash                 : num  2.43 2.14 2.67 2.5 2.87 2.45 2.45 2.61 2.17 2.27 ...
##  $ Ash_Alcanity        : num  15.6 11.2 18.6 16.8 21 15.2 14.6 17.6 14 16 ...
##  $ Magnesium           : int  127 100 101 113 118 112 96 121 97 98 ...
##  $ Total_Phenols       : num  2.8 2.65 2.8 3.85 2.8 3.27 2.5 2.6 2.8 2.98 ...
##  $ Flavanoids          : num  3.06 2.76 3.24 3.49 2.69 3.39 2.52 2.51 2.98 3.15 ...
##  $ Nonflavanoid_Phenols: num  0.28 0.26 0.3 0.24 0.39 0.34 0.3 0.31 0.29 0.22 ...
##  $ Proanthocyanins     : num  2.29 1.28 2.81 2.18 1.82 1.97 1.98 1.25 1.98 1.85 ...
##  $ Color_Intensity     : num  5.64 4.38 5.68 7.8 4.32 6.75 5.25 5.05 5.2 7.22 ...
##  $ Hue                 : num  1.04 1.05 1.03 0.86 1.04 1.05 1.02 1.06 1.08 1.01 ...
##  $ OD280               : num  3.92 3.4 3.17 3.45 2.93 2.85 3.58 3.58 2.85 3.55 ...
##  $ Proline             : int  1065 1050 1185 1480 735 1450 1290 1295 1045 1045 ...

P.4 Escalar los datos

# Solo si los datos no estan en la misma escala.
df1 <- scale(df)

P.5 Determinar el numero de grupos

# Siempre es un valor inicial "cualquiera", luego se optimiza
plot(df1)

grupos1 <- 3

P.6 Generar los grupos

clusters1 <- kmeans(df1,grupos1)
clusters1
## K-means clustering with 3 clusters of sizes 51, 65, 62
## 
## Cluster means:
##      Alcohol Malic_Acid        Ash Ash_Alcanity   Magnesium Total_Phenols
## 1  0.1644436  0.8690954  0.1863726    0.5228924 -0.07526047   -0.97657548
## 2 -0.9234669 -0.3929331 -0.4931257    0.1701220 -0.49032869   -0.07576891
## 3  0.8328826 -0.3029551  0.3636801   -0.6084749  0.57596208    0.88274724
##    Flavanoids Nonflavanoid_Phenols Proanthocyanins Color_Intensity        Hue
## 1 -1.21182921           0.72402116     -0.77751312       0.9388902 -1.1615122
## 2  0.02075402          -0.03343924      0.05810161      -0.8993770  0.4605046
## 3  0.97506900          -0.56050853      0.57865427       0.1705823  0.4726504
##        OD280    Proline
## 1 -1.2887761 -0.4059428
## 2  0.2700025 -0.7517257
## 3  0.7770551  1.1220202
## 
## Clustering vector:
##   [1] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
##  [38] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 2 2 1 2 2 2 2 2 2 2 2 2 2 2 3
##  [75] 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [112] 2 2 2 2 2 2 2 1 2 2 3 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [149] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 
## Within cluster sum of squares by cluster:
## [1] 326.3537 558.6971 385.6983
##  (between_SS / total_SS =  44.8 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

P.7 Optimizar el numero de grupos

set.seed(123)
optimizacion <- clusGap(df1, FUN=kmeans, nstart=1, K.max=7)
# El k.max noramlmente es 10, en este ejercicio al ser 8 datos se dejo en 7
plot(optimizacion, xlab="Numero de clusters k")

# Se selecciona como optimo el primer punto mas alto.

P.8 Graficar los grupos

fviz_cluster(clusters1, data=df1)

P.9 Agregar Clusters a la base de datos

df1_clusters <- cbind(df, cluster = clusters1$cluster)
head(df1_clusters)
##   Alcohol Malic_Acid  Ash Ash_Alcanity Magnesium Total_Phenols Flavanoids
## 1   14.23       1.71 2.43         15.6       127          2.80       3.06
## 2   13.20       1.78 2.14         11.2       100          2.65       2.76
## 3   13.16       2.36 2.67         18.6       101          2.80       3.24
## 4   14.37       1.95 2.50         16.8       113          3.85       3.49
## 5   13.24       2.59 2.87         21.0       118          2.80       2.69
## 6   14.20       1.76 2.45         15.2       112          3.27       3.39
##   Nonflavanoid_Phenols Proanthocyanins Color_Intensity  Hue OD280 Proline
## 1                 0.28            2.29            5.64 1.04  3.92    1065
## 2                 0.26            1.28            4.38 1.05  3.40    1050
## 3                 0.30            2.81            5.68 1.03  3.17    1185
## 4                 0.24            2.18            7.80 0.86  3.45    1480
## 5                 0.39            1.82            4.32 1.04  2.93     735
## 6                 0.34            1.97            6.75 1.05  2.85    1450
##   cluster
## 1       3
## 2       3
## 3       3
## 4       3
## 5       3
## 6       3

Conclusiones

La tecnica de clustering permite identificar patrones o grupos naturales en los datos sin necesidad de etiquetas privadas

LS0tDQp0aXRsZTogImNsdXN0ZXJzIHZpbm9zIg0KYXV0aG9yOiAiTWF4IFZpZGFsIg0KZGF0ZTogImByIFN5cy5EYXRlKClgIg0Kb3V0cHV0Og0KICBodG1sX2RvY3VtZW50Og0KICAgIHRvYzogVFJVRQ0KICAgIHRvY19mbG9hdDogVFJVRQ0KICAgIGNvZGVfZG93bmxvYWQ6IFRSVUUNCiAgICB0aGVtZTogY29zbW8NCi0tLQ0KIVtdKGh0dHBzOi8vaS5waW5pbWcuY29tL29yaWdpbmFscy9lOS9jZS9jYS9lOWNlY2EyZTYxZWE1OTFiNDk4NmU5OTdmOWE2YjcyZC5naWYpDQoNCiMgPHNwYW4gc3R5bGU9ImNvbG9yOmJsdWV2aW9sZXQ7Ij4gQ29udGV4dG8gPC9zcGFuPg0KDQpFc3RvcyBkYXRvcyBzb24gZWwgcmVzdWx0YWRvIGRlIHVuIGFuw6FsaXNpcyBxdcOtbWljbyBkZSB2aW5vcyBjdWx0aXZhZG9zIGVuIGxhIG1pc21hIHJlZ2nDs24gZGUgSXRhbGlhLCBwZXJvIGRlcml2YWRvcyBkZSAzIGN1bHRpdmFyZXMgZGlmZXJlbnRlcy4gIA0KDQpFbCBhbsOhbGlzaXMgZGV0ZXJtaW7DsyBsYXMgY2FudGlkYWRlcyBkZSAxMyBjb21wb25lbnRlcyBxdWUgc2UgZW5jdWVudHJhbiBlbiBjYWRhIHVubyBkZSBsb3MgMyBjdWx0aXZhcmVzLiAgDQoNCg0KIyA8c3BhbiBzdHlsZT0iY29sb3I6Ymx1ZXZpb2xldDsiPiBJbnN0YWxhciBwYXF1ZXRlcyB5IGxsYW1hciBsaWJyZXLDrWFzIDwvc3Bhbj4NCg0KPC9zcGFuPg0KDQpgYGB7cn0NCiNpbnN0YWxsLnBhY2thZ2VzKCJjbHVzdGVyIikgICNBbsOhbGlzaXMgZGUgQWdydXBhbWllbnRvDQpsaWJyYXJ5KGNsdXN0ZXIpDQojaW5zdGFsbC5wYWNrYWdlcygiZ2dwbG90MiIpICNHcmFmaWNhcg0KbGlicmFyeShnZ3Bsb3QyKQ0KI2luc3RhbGwucGFja2FnZXMoImRhdGEudGFibGUiKSAjTWFuZWpvIGRlIG11Y2hvcyBkYXRvcyANCmxpYnJhcnkoZGF0YS50YWJsZSkNCiNpbnN0YWxsLnBhY2thZ2VzKCJmYWN0b2V4dHJhIikgI0dyw6FmaWNhIG9wdGltaXphemNpw7NuIGRlIG7Dum1lcm8gZGUgY2x1c3RlcnMNCmxpYnJhcnkoZmFjdG9leHRyYSkNCmBgYA0KDQojIDxzcGFuIHN0eWxlPSJjb2xvcjpibHVldmlvbGV0OyI+IFBhc28yLiBPYnRlbmVyIGxvcyBkYXRvcyA8L3NwYW4+DQpgYGB7cn0NCmRmIDwtIHJlYWQuY3N2KCJDOlxcVXNlcnNcXE1heFxcRGVza3RvcFxcVU5JIFRFQ1xcN21vIHNlbWVzdHJlXFxtb2R1bG8gMlxcUlxcc2VndW5kYSBhY3Qgd2luZXNcXHdpbmUuY3N2IikNCmBgYA0KDQojIDxzcGFuIHN0eWxlPSJjb2xvcjpvcmFuZ2U7Ij4gUC4zIEVudGVuZGVyIGxvcyBkYXRvcyA8L3NwYW4+DQpgYGB7cn0NCnN1bW1hcnkoZGYpDQpzdHIoZGYpDQpgYGANCg0KIyA8c3BhbiBzdHlsZT0iY29sb3I6b3JhbmdlOyI+IFAuNCBFc2NhbGFyIGxvcyBkYXRvcyA8L3NwYW4+DQpgYGB7cn0NCiMgU29sbyBzaSBsb3MgZGF0b3Mgbm8gZXN0YW4gZW4gbGEgbWlzbWEgZXNjYWxhLg0KZGYxIDwtIHNjYWxlKGRmKQ0KYGBgDQoNCiMgPHNwYW4gc3R5bGU9ImNvbG9yOm9yYW5nZTsiPiBQLjUgRGV0ZXJtaW5hciBlbCBudW1lcm8gZGUgZ3J1cG9zIDwvc3Bhbj4NCmBgYHtyfQ0KIyBTaWVtcHJlIGVzIHVuIHZhbG9yIGluaWNpYWwgImN1YWxxdWllcmEiLCBsdWVnbyBzZSBvcHRpbWl6YQ0KcGxvdChkZjEpDQpncnVwb3MxIDwtIDMNCmBgYA0KIyA8c3BhbiBzdHlsZT0iY29sb3I6b3JhbmdlOyI+IFAuNiBHZW5lcmFyIGxvcyBncnVwb3MgPC9zcGFuPg0KYGBge3J9DQpjbHVzdGVyczEgPC0ga21lYW5zKGRmMSxncnVwb3MxKQ0KY2x1c3RlcnMxDQpgYGANCg0KIyA8c3BhbiBzdHlsZT0iY29sb3I6b3JhbmdlOyI+IFAuNyBPcHRpbWl6YXIgZWwgbnVtZXJvIGRlIGdydXBvcyA8L3NwYW4+DQpgYGB7cn0NCnNldC5zZWVkKDEyMykNCm9wdGltaXphY2lvbiA8LSBjbHVzR2FwKGRmMSwgRlVOPWttZWFucywgbnN0YXJ0PTEsIEsubWF4PTcpDQojIEVsIGsubWF4IG5vcmFtbG1lbnRlIGVzIDEwLCBlbiBlc3RlIGVqZXJjaWNpbyBhbCBzZXIgOCBkYXRvcyBzZSBkZWpvIGVuIDcNCnBsb3Qob3B0aW1pemFjaW9uLCB4bGFiPSJOdW1lcm8gZGUgY2x1c3RlcnMgayIpDQojIFNlIHNlbGVjY2lvbmEgY29tbyBvcHRpbW8gZWwgcHJpbWVyIHB1bnRvIG1hcyBhbHRvLg0KYGBgDQojIDxzcGFuIHN0eWxlPSJjb2xvcjpvcmFuZ2U7Ij4gUC44IEdyYWZpY2FyIGxvcyBncnVwb3MgPC9zcGFuPg0KYGBge3J9DQpmdml6X2NsdXN0ZXIoY2x1c3RlcnMxLCBkYXRhPWRmMSkNCmBgYA0KDQojIDxzcGFuIHN0eWxlPSJjb2xvcjpvcmFuZ2U7Ij4gUC45IEFncmVnYXIgQ2x1c3RlcnMgYSBsYSBiYXNlIGRlIGRhdG9zIDwvc3Bhbj4NCmBgYHtyfQ0KZGYxX2NsdXN0ZXJzIDwtIGNiaW5kKGRmLCBjbHVzdGVyID0gY2x1c3RlcnMxJGNsdXN0ZXIpDQpoZWFkKGRmMV9jbHVzdGVycykNCmBgYA0KDQojIDxzcGFuIHN0eWxlPSJjb2xvcjpvcmFuZ2U7Ij4gQ29uY2x1c2lvbmVzIDwvc3Bhbj4NCkxhIHRlY25pY2EgZGUgKmNsdXN0ZXJpbmcqIHBlcm1pdGUgaWRlbnRpZmljYXIgcGF0cm9uZXMgbyBncnVwb3MgbmF0dXJhbGVzIGVuIGxvcyBkYXRvcyBzaW4gbmVjZXNpZGFkIGRlIGV0aXF1ZXRhcyBwcml2YWRhcw0KDQoNCg0K