#1.- DATOS

#Instalamos primero el paquete "rattle"
#-----------------------------------------------
data(wine, package = "rattle")
head(wine)
##   Type Alcohol Malic  Ash Alcalinity Magnesium Phenols Flavanoids Nonflavanoids
## 1    1   14.23  1.71 2.43       15.6       127    2.80       3.06          0.28
## 2    1   13.20  1.78 2.14       11.2       100    2.65       2.76          0.26
## 3    1   13.16  2.36 2.67       18.6       101    2.80       3.24          0.30
## 4    1   14.37  1.95 2.50       16.8       113    3.85       3.49          0.24
## 5    1   13.24  2.59 2.87       21.0       118    2.80       2.69          0.39
## 6    1   14.20  1.76 2.45       15.2       112    3.27       3.39          0.34
##   Proanthocyanins Color  Hue Dilution Proline
## 1            2.29  5.64 1.04     3.92    1065
## 2            1.28  4.38 1.05     3.40    1050
## 3            2.81  5.68 1.03     3.17    1185
## 4            2.18  7.80 0.86     3.45    1480
## 5            1.82  4.32 1.04     2.93     735
## 6            1.97  6.75 1.05     2.85    1450

Hacemos un structure para comprobar la estructura de los datos.

# Estructura
#-----------------------
str(wine)
## 'data.frame':    178 obs. of  14 variables:
##  $ Type           : Factor w/ 3 levels "1","2","3": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Alcohol        : num  14.2 13.2 13.2 14.4 13.2 ...
##  $ Malic          : num  1.71 1.78 2.36 1.95 2.59 1.76 1.87 2.15 1.64 1.35 ...
##  $ Ash            : num  2.43 2.14 2.67 2.5 2.87 2.45 2.45 2.61 2.17 2.27 ...
##  $ Alcalinity     : num  15.6 11.2 18.6 16.8 21 15.2 14.6 17.6 14 16 ...
##  $ Magnesium      : int  127 100 101 113 118 112 96 121 97 98 ...
##  $ Phenols        : num  2.8 2.65 2.8 3.85 2.8 3.27 2.5 2.6 2.8 2.98 ...
##  $ Flavanoids     : num  3.06 2.76 3.24 3.49 2.69 3.39 2.52 2.51 2.98 3.15 ...
##  $ Nonflavanoids  : num  0.28 0.26 0.3 0.24 0.39 0.34 0.3 0.31 0.29 0.22 ...
##  $ Proanthocyanins: num  2.29 1.28 2.81 2.18 1.82 1.97 1.98 1.25 1.98 1.85 ...
##  $ Color          : num  5.64 4.38 5.68 7.8 4.32 6.75 5.25 5.05 5.2 7.22 ...
##  $ Hue            : num  1.04 1.05 1.03 0.86 1.04 1.05 1.02 1.06 1.08 1.01 ...
##  $ Dilution       : num  3.92 3.4 3.17 3.45 2.93 2.85 3.58 3.58 2.85 3.55 ...
##  $ Proline        : int  1065 1050 1185 1480 735 1450 1290 1295 1045 1045 ...

Es importante que tengamos una variable clasificadora “TYPE”, el resto son cuantitativas, así que para la normalización hay que acordarse de sólo aplicarla desde Alcohol a Proline.

# Escalamiento
#-----------------------------------------
wine.stand <- scale(wine[-1])  # Ponemos -1 para que no coja la clasificadora)

Comparaos los descriptivos de wine y wine.stand

# Descriptivos del data frame wine
#-------------------
summary(wine)
##  Type      Alcohol          Malic            Ash          Alcalinity   
##  1:59   Min.   :11.03   Min.   :0.740   Min.   :1.360   Min.   :10.60  
##  2:71   1st Qu.:12.36   1st Qu.:1.603   1st Qu.:2.210   1st Qu.:17.20  
##  3:48   Median :13.05   Median :1.865   Median :2.360   Median :19.50  
##         Mean   :13.00   Mean   :2.336   Mean   :2.367   Mean   :19.49  
##         3rd Qu.:13.68   3rd Qu.:3.083   3rd Qu.:2.558   3rd Qu.:21.50  
##         Max.   :14.83   Max.   :5.800   Max.   :3.230   Max.   :30.00  
##    Magnesium         Phenols        Flavanoids    Nonflavanoids   
##  Min.   : 70.00   Min.   :0.980   Min.   :0.340   Min.   :0.1300  
##  1st Qu.: 88.00   1st Qu.:1.742   1st Qu.:1.205   1st Qu.:0.2700  
##  Median : 98.00   Median :2.355   Median :2.135   Median :0.3400  
##  Mean   : 99.74   Mean   :2.295   Mean   :2.029   Mean   :0.3619  
##  3rd Qu.:107.00   3rd Qu.:2.800   3rd Qu.:2.875   3rd Qu.:0.4375  
##  Max.   :162.00   Max.   :3.880   Max.   :5.080   Max.   :0.6600  
##  Proanthocyanins     Color             Hue            Dilution    
##  Min.   :0.410   Min.   : 1.280   Min.   :0.4800   Min.   :1.270  
##  1st Qu.:1.250   1st Qu.: 3.220   1st Qu.:0.7825   1st Qu.:1.938  
##  Median :1.555   Median : 4.690   Median :0.9650   Median :2.780  
##  Mean   :1.591   Mean   : 5.058   Mean   :0.9574   Mean   :2.612  
##  3rd Qu.:1.950   3rd Qu.: 6.200   3rd Qu.:1.1200   3rd Qu.:3.170  
##  Max.   :3.580   Max.   :13.000   Max.   :1.7100   Max.   :4.000  
##     Proline      
##  Min.   : 278.0  
##  1st Qu.: 500.5  
##  Median : 673.5  
##  Mean   : 746.9  
##  3rd Qu.: 985.0  
##  Max.   :1680.0
# Descriptivos del data frame wine.stand
#-------------------------------------------------------
summary(wine.stand)
##     Alcohol             Malic              Ash             Alcalinity       
##  Min.   :-2.42739   Min.   :-1.4290   Min.   :-3.66881   Min.   :-2.663505  
##  1st Qu.:-0.78603   1st Qu.:-0.6569   1st Qu.:-0.57051   1st Qu.:-0.687199  
##  Median : 0.06083   Median :-0.4219   Median :-0.02375   Median : 0.001514  
##  Mean   : 0.00000   Mean   : 0.0000   Mean   : 0.00000   Mean   : 0.000000  
##  3rd Qu.: 0.83378   3rd Qu.: 0.6679   3rd Qu.: 0.69615   3rd Qu.: 0.600395  
##  Max.   : 2.25341   Max.   : 3.1004   Max.   : 3.14745   Max.   : 3.145637  
##    Magnesium          Phenols           Flavanoids      Nonflavanoids    
##  Min.   :-2.0824   Min.   :-2.10132   Min.   :-1.6912   Min.   :-1.8630  
##  1st Qu.:-0.8221   1st Qu.:-0.88298   1st Qu.:-0.8252   1st Qu.:-0.7381  
##  Median :-0.1219   Median : 0.09569   Median : 0.1059   Median :-0.1756  
##  Mean   : 0.0000   Mean   : 0.00000   Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 0.5082   3rd Qu.: 0.80672   3rd Qu.: 0.8467   3rd Qu.: 0.6078  
##  Max.   : 4.3591   Max.   : 2.53237   Max.   : 3.0542   Max.   : 2.3956  
##  Proanthocyanins        Color              Hue              Dilution      
##  Min.   :-2.06321   Min.   :-1.6297   Min.   :-2.08884   Min.   :-1.8897  
##  1st Qu.:-0.59560   1st Qu.:-0.7929   1st Qu.:-0.76540   1st Qu.:-0.9496  
##  Median :-0.06272   Median :-0.1588   Median : 0.03303   Median : 0.2371  
##  Mean   : 0.00000   Mean   : 0.0000   Mean   : 0.00000   Mean   : 0.0000  
##  3rd Qu.: 0.62741   3rd Qu.: 0.4926   3rd Qu.: 0.71116   3rd Qu.: 0.7864  
##  Max.   : 3.47527   Max.   : 3.4258   Max.   : 3.29241   Max.   : 1.9554  
##     Proline       
##  Min.   :-1.4890  
##  1st Qu.:-0.7824  
##  Median :-0.2331  
##  Mean   : 0.0000  
##  3rd Qu.: 0.7561  
##  Max.   : 2.9631

Todas las medias son cero, así que se han estandarizado bien.

#2.- K-Medias

k.means.fit <- kmeans(wine.stand, 3)   # k = 3
# Tipo de comandos del k means
#---------------------------------
attributes(k.means.fit)
## $names
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"      
## 
## $class
## [1] "kmeans"
#centroides
#----------------------------------
k.means.fit$centers   # centroids
##      Alcohol      Malic        Ash Alcalinity   Magnesium     Phenols
## 1  0.1644436  0.8690954  0.1863726  0.5228924 -0.07526047 -0.97657548
## 2 -0.9234669 -0.3929331 -0.4931257  0.1701220 -0.49032869 -0.07576891
## 3  0.8328826 -0.3029551  0.3636801 -0.6084749  0.57596208  0.88274724
##    Flavanoids Nonflavanoids Proanthocyanins      Color        Hue   Dilution
## 1 -1.21182921    0.72402116     -0.77751312  0.9388902 -1.1615122 -1.2887761
## 2  0.02075402   -0.03343924      0.05810161 -0.8993770  0.4605046  0.2700025
## 3  0.97506900   -0.56050853      0.57865427  0.1705823  0.4726504  0.7770551
##      Proline
## 1 -0.4059428
## 2 -0.7517257
## 3  1.1220202

Aquí tenemos las medias de cada conglomerado por cada variable analizada.

# Clusters:
#---------------------------------
k.means.fit$cluster
##   [1] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
##  [38] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 2 2 1 2 2 2 2 2 2 2 2 2 2 2 3
##  [75] 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [112] 2 2 2 2 2 2 2 1 2 2 3 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [149] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
# Tamaño de los clusters
#----------------------------------
k.means.fit$size
## [1] 51 65 62
# Criterio de elbow para la selección de cluster en función de la varianza explicada
#---------------------------------------------------------
wssplot <- function(data, nc=15, seed=1234){
  wss <- (nrow(data)-1)*sum(apply(data,2,var))
  for (i in 2:nc){
    set.seed(seed)
    wss[i] <- sum(kmeans(data, centers=i)$withinss)}
  plot(1:nc, wss, type="b", xlab="Number of Clusters",
       ylab="Within groups sum of squares")}

wssplot(wine.stand, nc=6)

El número óptimo de clusters es 3, que es donde hace el codo el gráfico.

# Representación de los clusters
#----------------------------------------
library(cluster)
clusplot(wine.stand, k.means.fit$cluster, main = '2D representation of the cluster solution',
         color = TRUE, shade = TRUE,
         labels = 2, lines = 0)

# Matriz de confusión
#---------------------------------------
table(wine[,1],k.means.fit$cluster)
##    
##      1  2  3
##   1  0  0 59
##   2  3 65  3
##   3 48  0  0

#** 3.- CLUSTER JERÁRQUICO**

# Distancias euclídeas
#-----------------------------
d <- dist(wine.stand, method = "euclidean")
# Vinculación por método de ward
H.fit <- hclust(d, method = "ward")
## The "ward" method has been renamed to "ward.D"; note new "ward.D2"
# Representación del dendograma
#----------------------------------------
plot(H.fit)
groups <- cutree(H.fit, k=3)
rect.hclust(H.fit, k=3, border = "red")     # Así se seleccionan los clusters para k = 3

# Matriz de confusión
#---------------------------------
table(wine[,1],groups)
##    groups
##      1  2  3
##   1 58  1  0
##   2  7 58  6
##   3  0  0 48