
# <span style="color: magenta;">Contexto</span>
# Estos datos son el resultado de un análisis químico de vinos cultivados en la misma región de Italia pero derivados de tres cultivares diferentes.
# El análisis determinó las cantidades de 13 componentes que se encuentran en cada uno de los tres tipos de vinos.
# Fuente:
# [Wine dataset](https://search.r-project.org/CRAN/refmans/HDclassif/html/wine.html)
# <span style="color: magenta;">Paso 1. Instalar paquetes y llamar librerías</span>
# install.packages("cluster")
library(cluster)
# install.packages("ggplot2")
library(ggplot2)
# install.packages("data.table")
library(data.table)
# install.packages("factoextra")
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
# <span style="color: magenta;">Paso 2. Obtener los datos</span>
datos <- read.csv("C:\\Users\\lcbor\\Downloads\\wine.csv")
# <span style="color: magenta;">Paso 3. Entender la base de datos</span>
summary(datos)
## Alcohol Malic_Acid Ash Ash_Alcanity
## Min. :11.03 Min. :0.740 Min. :1.360 Min. :10.60
## 1st Qu.:12.36 1st Qu.:1.603 1st Qu.:2.210 1st Qu.:17.20
## Median :13.05 Median :1.865 Median :2.360 Median :19.50
## Mean :13.00 Mean :2.336 Mean :2.367 Mean :19.49
## 3rd Qu.:13.68 3rd Qu.:3.083 3rd Qu.:2.558 3rd Qu.:21.50
## Max. :14.83 Max. :5.800 Max. :3.230 Max. :30.00
## Magnesium Total_Phenols Flavanoids Nonflavanoid_Phenols
## Min. : 70.00 Min. :0.980 Min. :0.340 Min. :0.1300
## 1st Qu.: 88.00 1st Qu.:1.742 1st Qu.:1.205 1st Qu.:0.2700
## Median : 98.00 Median :2.355 Median :2.135 Median :0.3400
## Mean : 99.74 Mean :2.295 Mean :2.029 Mean :0.3619
## 3rd Qu.:107.00 3rd Qu.:2.800 3rd Qu.:2.875 3rd Qu.:0.4375
## Max. :162.00 Max. :3.880 Max. :5.080 Max. :0.6600
## Proanthocyanins Color_Intensity Hue OD280
## Min. :0.410 Min. : 1.280 Min. :0.4800 Min. :1.270
## 1st Qu.:1.250 1st Qu.: 3.220 1st Qu.:0.7825 1st Qu.:1.938
## Median :1.555 Median : 4.690 Median :0.9650 Median :2.780
## Mean :1.591 Mean : 5.058 Mean :0.9574 Mean :2.612
## 3rd Qu.:1.950 3rd Qu.: 6.200 3rd Qu.:1.1200 3rd Qu.:3.170
## Max. :3.580 Max. :13.000 Max. :1.7100 Max. :4.000
## Proline
## Min. : 278.0
## 1st Qu.: 500.5
## Median : 673.5
## Mean : 746.9
## 3rd Qu.: 985.0
## Max. :1680.0
# <span style="color: magenta;">Paso 4. Escalar la base de datos</span>
df <- scale(datos)
# <span style="color: magenta;">Paso 5. Cantidad de grupos</span>
grupos <- 3
# <span style="color: magenta;">Paso 6. Generar los segmentos</span>
segmentos <- kmeans(df,grupos)
segmentos
## K-means clustering with 3 clusters of sizes 62, 65, 51
##
## Cluster means:
## Alcohol Malic_Acid Ash Ash_Alcanity Magnesium Total_Phenols
## 1 0.8328826 -0.3029551 0.3636801 -0.6084749 0.57596208 0.88274724
## 2 -0.9234669 -0.3929331 -0.4931257 0.1701220 -0.49032869 -0.07576891
## 3 0.1644436 0.8690954 0.1863726 0.5228924 -0.07526047 -0.97657548
## Flavanoids Nonflavanoid_Phenols Proanthocyanins Color_Intensity Hue
## 1 0.97506900 -0.56050853 0.57865427 0.1705823 0.4726504
## 2 0.02075402 -0.03343924 0.05810161 -0.8993770 0.4605046
## 3 -1.21182921 0.72402116 -0.77751312 0.9388902 -1.1615122
## OD280 Proline
## 1 0.7770551 1.1220202
## 2 0.2700025 -0.7517257
## 3 -1.2887761 -0.4059428
##
## Clustering vector:
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [38] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 3 2 2 2 2 2 2 2 2 2 2 2 1
## [75] 2 2 2 2 2 2 2 2 2 3 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [112] 2 2 2 2 2 2 2 3 2 2 1 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## [149] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
##
## Within cluster sum of squares by cluster:
## [1] 385.6983 558.6971 326.3537
## (between_SS / total_SS = 44.8 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
# <span style="color: magenta;">Paso 7. Asignar el grupo al que pertenece cada observación</span>
asignacion <- cbind(datos, cluster = segmentos$cluster)
# asignacion
# <span style="color: magenta;">Paso 8. Graficar los clusters</span>
fviz_cluster(segmentos, data=df)

# <span style="color: magenta;">Paso 9. Optimizar la cantidad de grupos</span>
# La cantidad óptima de grupos corresponde al punto más alto de la siguiente gráfica.
set.seed(123)
optimizacion <- clusGap(df, FUN=kmeans, nstart=1, K.max =10)
plot(optimizacion, xlab="Número de clusters k")

# <span style="color: magenta;">Paso 10. Comparar segmentos </span>
promedio <- aggregate(asignacion, by=list(asignacion$cluster), FUN=mean)
promedio
## Group.1 Alcohol Malic_Acid Ash Ash_Alcanity Magnesium Total_Phenols
## 1 1 13.67677 1.997903 2.466290 17.46290 107.96774 2.847581
## 2 2 12.25092 1.897385 2.231231 20.06308 92.73846 2.247692
## 3 3 13.13412 3.307255 2.417647 21.24118 98.66667 1.683922
## Flavanoids Nonflavanoid_Phenols Proanthocyanins Color_Intensity Hue
## 1 3.0032258 0.2920968 1.922097 5.453548 1.0654839
## 2 2.0500000 0.3576923 1.624154 2.973077 1.0627077
## 3 0.8188235 0.4519608 1.145882 7.234706 0.6919608
## OD280 Proline cluster
## 1 3.163387 1100.2258 1
## 2 2.803385 510.1692 2
## 3 1.696667 619.0588 3
# <span style="color: magenta;">Conclusión</span>
# La **segmentación** o *clusters* es un algoritmo útil para identificar el cultivar correspondiente a cada vino.
LS0tDQp0aXRsZTogIlNlZ21lbnRhY2lvbiBkZSBWaW5vcyINCmF1dGhvcjogIkx1aXMgQ2FybG9zIEJvcmJvbiBNYXJ0aW5leiINCmRhdGU6ICIyMDI0LTA4LTEyIg0Kb3V0cHV0Og0KICBodG1sX2RvY3VtZW50Og0KICAgIHRvYzogdHJ1ZQ0KICAgIHRvY19mb2F0OiB0cnVlDQogICAgY29kZV9kb3dubG9hZDogdHJ1ZQ0KICAgIHRoZW1lOiAiY29zbW8iDQotLS0NCg0KIVtdKEM6XFxVc2Vyc1xcbGNib3JcXERvY3VtZW50c1xccmVkLXdpbmUtcG91cmluZy1mcm9tLWJvdHRsZS1nbGFzcy53aWR0aC0xOTIwLmpwZykNCg0KYGBge3J9DQojIDxzcGFuIHN0eWxlPSJjb2xvcjogbWFnZW50YTsiPkNvbnRleHRvPC9zcGFuPiANCiMgRXN0b3MgZGF0b3Mgc29uIGVsIHJlc3VsdGFkbyBkZSB1biBhbsOhbGlzaXMgcXXDrW1pY28gZGUgdmlub3MgY3VsdGl2YWRvcyBlbiBsYSBtaXNtYSByZWdpw7NuIGRlIEl0YWxpYSBwZXJvIGRlcml2YWRvcyBkZSB0cmVzIGN1bHRpdmFyZXMgZGlmZXJlbnRlcy4gIA0KDQojIEVsIGFuw6FsaXNpcyBkZXRlcm1pbsOzIGxhcyBjYW50aWRhZGVzIGRlIDEzIGNvbXBvbmVudGVzIHF1ZSBzZSBlbmN1ZW50cmFuIGVuIGNhZGEgdW5vIGRlIGxvcyB0cmVzIHRpcG9zIGRlIHZpbm9zLg0KDQojIEZ1ZW50ZTogIA0KIyBbV2luZSBkYXRhc2V0XShodHRwczovL3NlYXJjaC5yLXByb2plY3Qub3JnL0NSQU4vcmVmbWFucy9IRGNsYXNzaWYvaHRtbC93aW5lLmh0bWwpDQoNCiMgPHNwYW4gc3R5bGU9ImNvbG9yOiBtYWdlbnRhOyI+UGFzbyAxLiBJbnN0YWxhciBwYXF1ZXRlcyB5IGxsYW1hciBsaWJyZXLDrWFzPC9zcGFuPiANCg0KIyBpbnN0YWxsLnBhY2thZ2VzKCJjbHVzdGVyIikNCmxpYnJhcnkoY2x1c3RlcikNCiMgaW5zdGFsbC5wYWNrYWdlcygiZ2dwbG90MiIpDQpsaWJyYXJ5KGdncGxvdDIpDQojIGluc3RhbGwucGFja2FnZXMoImRhdGEudGFibGUiKQ0KbGlicmFyeShkYXRhLnRhYmxlKQ0KIyBpbnN0YWxsLnBhY2thZ2VzKCJmYWN0b2V4dHJhIikNCmxpYnJhcnkoZmFjdG9leHRyYSkNCg0KDQojIDxzcGFuIHN0eWxlPSJjb2xvcjogbWFnZW50YTsiPlBhc28gMi4gT2J0ZW5lciBsb3MgZGF0b3M8L3NwYW4+IA0KZGF0b3MgPC0gcmVhZC5jc3YoIkM6XFxVc2Vyc1xcbGNib3JcXERvd25sb2Fkc1xcd2luZS5jc3YiKQ0KDQoNCiMgPHNwYW4gc3R5bGU9ImNvbG9yOiBtYWdlbnRhOyI+UGFzbyAzLiBFbnRlbmRlciBsYSBiYXNlIGRlIGRhdG9zPC9zcGFuPiANCnN1bW1hcnkoZGF0b3MpDQoNCg0KIyA8c3BhbiBzdHlsZT0iY29sb3I6IG1hZ2VudGE7Ij5QYXNvIDQuIEVzY2FsYXIgbGEgYmFzZSBkZSBkYXRvczwvc3Bhbj4gDQpkZiA8LSBzY2FsZShkYXRvcykNCg0KDQojIDxzcGFuIHN0eWxlPSJjb2xvcjogbWFnZW50YTsiPlBhc28gNS4gQ2FudGlkYWQgZGUgZ3J1cG9zPC9zcGFuPiANCmdydXBvcyA8LSAzDQoNCg0KIyA8c3BhbiBzdHlsZT0iY29sb3I6IG1hZ2VudGE7Ij5QYXNvIDYuIEdlbmVyYXIgbG9zIHNlZ21lbnRvczwvc3Bhbj4gDQpzZWdtZW50b3MgPC0ga21lYW5zKGRmLGdydXBvcykNCnNlZ21lbnRvcw0KDQoNCiMgPHNwYW4gc3R5bGU9ImNvbG9yOiBtYWdlbnRhOyI+UGFzbyA3LiBBc2lnbmFyIGVsIGdydXBvIGFsIHF1ZSBwZXJ0ZW5lY2UgY2FkYSBvYnNlcnZhY2nDs248L3NwYW4+IA0KDQphc2lnbmFjaW9uIDwtIGNiaW5kKGRhdG9zLCBjbHVzdGVyID0gc2VnbWVudG9zJGNsdXN0ZXIpDQojIGFzaWduYWNpb24NCg0KIyA8c3BhbiBzdHlsZT0iY29sb3I6IG1hZ2VudGE7Ij5QYXNvIDguIEdyYWZpY2FyIGxvcyBjbHVzdGVyczwvc3Bhbj4gDQoNCmZ2aXpfY2x1c3RlcihzZWdtZW50b3MsIGRhdGE9ZGYpDQoNCg0KIyA8c3BhbiBzdHlsZT0iY29sb3I6IG1hZ2VudGE7Ij5QYXNvIDkuIE9wdGltaXphciBsYSBjYW50aWRhZCBkZSBncnVwb3M8L3NwYW4+IA0KIyBMYSBjYW50aWRhZCDDs3B0aW1hIGRlIGdydXBvcyBjb3JyZXNwb25kZSBhbCBwdW50byBtw6FzIGFsdG8gZGUgbGEgc2lndWllbnRlIGdyw6FmaWNhLg0KDQpzZXQuc2VlZCgxMjMpDQpvcHRpbWl6YWNpb24gPC0gY2x1c0dhcChkZiwgRlVOPWttZWFucywgbnN0YXJ0PTEsIEsubWF4ID0xMCkNCnBsb3Qob3B0aW1pemFjaW9uLCB4bGFiPSJOw7ptZXJvIGRlIGNsdXN0ZXJzIGsiKQ0KDQojIDxzcGFuIHN0eWxlPSJjb2xvcjogbWFnZW50YTsiPlBhc28gMTAuIENvbXBhcmFyIHNlZ21lbnRvcyA8L3NwYW4+IA0KDQpwcm9tZWRpbyA8LSBhZ2dyZWdhdGUoYXNpZ25hY2lvbiwgYnk9bGlzdChhc2lnbmFjaW9uJGNsdXN0ZXIpLCBGVU49bWVhbikNCnByb21lZGlvDQoNCg0KDQojIDxzcGFuIHN0eWxlPSJjb2xvcjogbWFnZW50YTsiPkNvbmNsdXNpw7NuPC9zcGFuPiAgDQojIExhICoqc2VnbWVudGFjacOzbioqIG8gKmNsdXN0ZXJzKiBlcyB1biBhbGdvcml0bW8gw7p0aWwgcGFyYSBpZGVudGlmaWNhciBlbCBjdWx0aXZhciBjb3JyZXNwb25kaWVudGUgYSBjYWRhIHZpbm8uDQoNCmBgYA==