Estos datos son el resultado de un análisis químico de vinos cultivados en la misma región de Italia, pero derivados de tres cultivares diferentes.
El análisis determinó las cantidades de 13 componentes que se encuentran en cada uno de los tres cultivares.
# Packages used by this analysis. Uncomment the matching install.packages()
# line if a package is not installed yet.
# install.packages("cluster")     # cluster analysis
library(cluster)
# install.packages("ggplot2")     # plotting
library(ggplot2)
# install.packages("data.table")  # handling large data sets
library(data.table)
# install.packages("factoextra")  # plots for choosing the number of clusters
library(factoextra)
# Read the wine measurements into a data frame.
# NOTE(review): this is an absolute path inside one user's OneDrive folder;
# the script will not run on another machine without adjusting it.
df2 <- read.csv(
  file = "~/Library/CloudStorage/OneDrive-InstitutoTecnologicoydeEstudiosSuperioresdeMonterrey/SEM 7/M2/wine.csv"
)
# Per-column overview: min, quartiles, median, mean, max (and NA counts when
# a column has missing values).
summary(df2)
## Alcohol Malic_Acid Ash Ash_Alcanity
## Min. :11.03 Min. :0.740 Min. :1.360 Min. :10.60
## 1st Qu.:12.36 1st Qu.:1.603 1st Qu.:2.210 1st Qu.:17.20
## Median :13.05 Median :1.865 Median :2.360 Median :19.50
## Mean :13.00 Mean :2.336 Mean :2.367 Mean :19.49
## 3rd Qu.:13.68 3rd Qu.:3.083 3rd Qu.:2.558 3rd Qu.:21.50
## Max. :14.83 Max. :5.800 Max. :3.230 Max. :30.00
## Magnesium Total_Phenols Flavanoids Nonflavanoid_Phenols
## Min. : 70.00 Min. :0.980 Min. :0.340 Min. :0.1300
## 1st Qu.: 88.00 1st Qu.:1.742 1st Qu.:1.205 1st Qu.:0.2700
## Median : 98.00 Median :2.355 Median :2.135 Median :0.3400
## Mean : 99.74 Mean :2.295 Mean :2.029 Mean :0.3619
## 3rd Qu.:107.00 3rd Qu.:2.800 3rd Qu.:2.875 3rd Qu.:0.4375
## Max. :162.00 Max. :3.880 Max. :5.080 Max. :0.6600
## Proanthocyanins Color_Intensity Hue OD280
## Min. :0.410 Min. : 1.280 Min. :0.4800 Min. :1.270
## 1st Qu.:1.250 1st Qu.: 3.220 1st Qu.:0.7825 1st Qu.:1.938
## Median :1.555 Median : 4.690 Median :0.9650 Median :2.780
## Mean :1.591 Mean : 5.058 Mean :0.9574 Mean :2.612
## 3rd Qu.:1.950 3rd Qu.: 6.200 3rd Qu.:1.1200 3rd Qu.:3.170
## Max. :3.580 Max. :13.000 Max. :1.7100 Max. :4.000
## Proline
## Min. : 278.0
## 1st Qu.: 500.5
## Median : 673.5
## Mean : 746.9
## 3rd Qu.: 985.0
## Max. :1680.0
str(df2) # structure: object type, number of observations and variables
## 'data.frame': 178 obs. of 13 variables:
## $ Alcohol : num 14.2 13.2 13.2 14.4 13.2 ...
## $ Malic_Acid : num 1.71 1.78 2.36 1.95 2.59 1.76 1.87 2.15 1.64 1.35 ...
## $ Ash : num 2.43 2.14 2.67 2.5 2.87 2.45 2.45 2.61 2.17 2.27 ...
## $ Ash_Alcanity : num 15.6 11.2 18.6 16.8 21 15.2 14.6 17.6 14 16 ...
## $ Magnesium : int 127 100 101 113 118 112 96 121 97 98 ...
## $ Total_Phenols : num 2.8 2.65 2.8 3.85 2.8 3.27 2.5 2.6 2.8 2.98 ...
## $ Flavanoids : num 3.06 2.76 3.24 3.49 2.69 3.39 2.52 2.51 2.98 3.15 ...
## $ Nonflavanoid_Phenols: num 0.28 0.26 0.3 0.24 0.39 0.34 0.3 0.31 0.29 0.22 ...
## $ Proanthocyanins : num 2.29 1.28 2.81 2.18 1.82 1.97 1.98 1.25 1.98 1.85 ...
## $ Color_Intensity : num 5.64 4.38 5.68 7.8 4.32 6.75 5.25 5.05 5.2 7.22 ...
## $ Hue : num 1.04 1.05 1.03 0.86 1.04 1.05 1.02 1.06 1.08 1.01 ...
## $ OD280 : num 3.92 3.4 3.17 3.45 2.93 2.85 3.58 3.58 2.85 3.55 ...
## $ Proline : int 1065 1050 1185 1480 735 1450 1290 1295 1045 1045 ...
# Standardize every column (subtract the column mean, divide by the column
# standard deviation). Only needed when the variables are not already on the
# same scale.
df2e <- scale(df2, center = TRUE, scale = TRUE)
# Number of clusters to request. This is just a starting value that is tuned
# afterwards; 3 is chosen because the wines come from 3 cultivars.
grupos1 <- 3
# Seed the RNG so the random k-means initialization is repeatable.
set.seed(123)
clusters2 <- kmeans(x = df2e, centers = grupos1)
# Auto-print the fitted object (sizes, centers, assignments, sums of squares).
clusters2
## K-means clustering with 3 clusters of sizes 51, 62, 65
##
## Cluster means:
## Alcohol Malic_Acid Ash Ash_Alcanity Magnesium Total_Phenols
## 1 0.1644436 0.8690954 0.1863726 0.5228924 -0.07526047 -0.97657548
## 2 0.8328826 -0.3029551 0.3636801 -0.6084749 0.57596208 0.88274724
## 3 -0.9234669 -0.3929331 -0.4931257 0.1701220 -0.49032869 -0.07576891
## Flavanoids Nonflavanoid_Phenols Proanthocyanins Color_Intensity Hue
## 1 -1.21182921 0.72402116 -0.77751312 0.9388902 -1.1615122
## 2 0.97506900 -0.56050853 0.57865427 0.1705823 0.4726504
## 3 0.02075402 -0.03343924 0.05810161 -0.8993770 0.4605046
## OD280 Proline
## 1 -1.2887761 -0.4059428
## 2 0.7770551 1.1220202
## 3 0.2700025 -0.7517257
##
## Clustering vector:
## [1] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [38] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 3 1 3 3 3 3 3 3 3 3 3 3 3 2
## [75] 3 3 3 3 3 3 3 3 3 1 3 3 3 3 3 3 3 3 3 3 3 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## [112] 3 3 3 3 3 3 3 1 3 3 2 3 3 3 3 3 3 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [149] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##
## Within cluster sum of squares by cluster:
## [1] 326.3537 385.6983 558.6971
## (between_SS / total_SS = 44.8 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
# Check the choice of k with the gap statistic, then plot the fitted k-means
# partition and attach the cluster labels to the scaled data.

# Seed the RNG: both clusGap() and kmeans() rely on random draws.
set.seed(123)
# Gap statistic for k = 1..10 using k-means as the clustering function.
# NOTE(review): nstart = 1 gives kmeans() a single random start per k, which
# can make the gap curve noisy; consider a larger nstart if the curve looks
# unstable. Left unchanged here to keep the existing results.
optimizacion2 <- clusGap(df2e, FUN = kmeans, nstart = 1, K.max = 10)
# The accented characters in the labels are written as \u escapes so they
# render correctly regardless of the encoding the file is saved or read in.
# Reading the plot: the first peak (first highest point) is chosen as the
# optimal number of clusters.
plot(
  optimizacion2,
  xlab = "N\u00famero de clusters k",
  main = "Optimizaci\u00f3n de Clusters"
)
# Plot the k-means partition stored in clusters2 over the scaled data.
fviz_cluster(clusters2, data = df2e)
# Append each observation's cluster label as an extra column named "cluster".
df2e_clusters <- cbind(df2e, cluster = clusters2$cluster)
head(df2e_clusters)
## Alcohol Malic_Acid Ash Ash_Alcanity Magnesium Total_Phenols
## [1,] 1.5143408 -0.56066822 0.2313998 -1.1663032 1.90852151 0.8067217
## [2,] 0.2455968 -0.49800856 -0.8256672 -2.4838405 0.01809398 0.5670481
## [3,] 0.1963252 0.02117152 1.1062139 -0.2679823 0.08810981 0.8067217
## [4,] 1.6867914 -0.34583508 0.4865539 -0.8069748 0.92829983 2.4844372
## [5,] 0.2948684 0.22705328 1.8352256 0.4506745 1.27837900 0.8067217
## [6,] 1.4773871 -0.51591132 0.3043010 -1.2860793 0.85828399 1.5576991
## Flavanoids Nonflavanoid_Phenols Proanthocyanins Color_Intensity Hue
## [1,] 1.0319081 -0.6577078 1.2214385 0.2510088 0.3611585
## [2,] 0.7315653 -0.8184106 -0.5431887 -0.2924962 0.4049085
## [3,] 1.2121137 -0.4970050 2.1299594 0.2682629 0.3174085
## [4,] 1.4623994 -0.9791134 1.0292513 1.1827317 -0.4263410
## [5,] 0.6614853 0.2261576 0.4002753 -0.3183774 0.3611585
## [6,] 1.3622851 -0.1755994 0.6623487 0.7298108 0.4049085
## OD280 Proline cluster
## [1,] 1.8427215 1.01015939 2
## [2,] 1.1103172 0.96252635 2
## [3,] 0.7863692 1.39122370 2
## [4,] 1.1807407 2.32800680 2
## [5,] 0.4483365 -0.03776747 2
## [6,] 0.3356589 2.23274072 2