Ejemplo 1(aglomerativos)

En este ejemplo utilizamos el conjunto de datos metcars con un cluster aglomerativo

head(mtcars,2) # examinamos el comjunto de datos

##               mpg cyl disp  hp drat    wt  qsec vs am gear carb
## Mazda RX4      21   6  160 110  3.9 2.620 16.46  0  1    4    4
## Mazda RX4 Wag  21   6  160 110  3.9 2.875 17.02  0  1    4    4

data <-scale(mtcars)# escalamos el conjunto 
data_d <- dist(data,method = "euclidean")

# cluster aglomerativo: método mínimo
clust1 <-hclust(data_d,method = "single")
plot(clust1,hang=-0.01,cex=0.6)

#método 2
clust2 <-hclust(data_d,method = "complete")
plot(clust2,hang=-0.01,cex=0.6)

#método 3
clust3 <-hclust(data_d,method ="ward.D2")
plot(clust3,hang=-0.01,cex=0.6)
rect.hclust(clust3,k=4,border = "orange")

clust3

## 
## Call:
## hclust(d = data_d, method = "ward.D2")
## 
## Cluster method   : ward.D2 
## Distance         : euclidean 
## Number of objects: 32

Ejemplo 2(cluster divisitivo)

library(cluster)

hc<-diana(scale(mtcars),metric = "euclidian")
par(mfrow=c(1,2))
plot(hc)

Ejemplo 3(utilizando factoextra,aglomerativo)

data_d<-dist(scale(USArrests)) # matriz de distancias
as.matrix(data_d)[1:6, 1:6] # muestra de la matriz

##             Alabama   Alaska  Arizona Arkansas California Colorado
## Alabama    0.000000 2.703754 2.293520 1.289810   3.263110 2.651067
## Alaska     2.703754 0.000000 2.700643 2.826039   3.012541 2.326519
## Arizona    2.293520 2.700643 0.000000 2.717758   1.310484 1.365031
## Arkansas   1.289810 2.826039 2.717758 0.000000   3.763641 2.831051
## California 3.263110 3.012541 1.310484 3.763641   0.000000 1.287619
## Colorado   2.651067 2.326519 1.365031 2.831051   1.287619 0.000000

hc<- hclust(data_d, method = "ward.D2")
# Dendogramas
fviz_dend(hc, cex = 0.5)

fviz_dend(hc, cex = 0.5,horiz = T)

#cortando el arbol
cutree(hc,k=4)->ajuste
table(ajuste)

## ajuste
##  1  2  3  4 
##  7 12 19 12

rownames(USArrests)[ajuste==1]

## [1] "Alabama"        "Georgia"        "Louisiana"      "Mississippi"   
## [5] "North Carolina" "South Carolina" "Tennessee"

rownames(USArrests)[ajuste==2]

##  [1] "Alaska"     "Arizona"    "California" "Colorado"   "Florida"   
##  [6] "Illinois"   "Maryland"   "Michigan"   "Nevada"     "New Mexico"
## [11] "New York"   "Texas"

rownames(USArrests)[ajuste==3]

##  [1] "Arkansas"      "Connecticut"   "Delaware"      "Hawaii"       
##  [5] "Indiana"       "Kansas"        "Kentucky"      "Massachusetts"
##  [9] "Missouri"      "New Jersey"    "Ohio"          "Oklahoma"     
## [13] "Oregon"        "Pennsylvania"  "Rhode Island"  "Utah"         
## [17] "Virginia"      "Washington"    "Wyoming"

rownames(USArrests)[ajuste==4]

##  [1] "Idaho"         "Iowa"          "Maine"         "Minnesota"    
##  [5] "Montana"       "Nebraska"      "New Hampshire" "North Dakota" 
##  [9] "South Dakota"  "Vermont"       "West Virginia" "Wisconsin"

fviz_dend(hc, k=4,
          cex= 0.5,
          k_colors = c("yellow", "red", "orange", "blue"),
          color_labels_by_k = TRUE,
          rect = TRUE,
          rect_fill = T)

#gráfico utilizando componentes principales
fviz_cluster(list(data = USArrests, cluster = ajuste),palette = c("yellow", "steelblue", "salmon", "orange"),
             ellipse.type = "convex",
             show.clust.cent = FALSE,
             ggtheme = theme_minimal())

Ejemplo 4 (partitivo kmeans)

protein<-read.csv("protein.csv")
glimpse(protein)

## Observations: 25
## Variables: 10
## $ Country   <fct> Albania, Austria, Belgium, Bulgaria, Czechoslovakia,...
## $ RedMeat   <dbl> 10.1, 8.9, 13.5, 7.8, 9.7, 10.6, 8.4, 9.5, 18.0, 10....
## $ WhiteMeat <dbl> 1.4, 14.0, 9.3, 6.0, 11.4, 10.8, 11.6, 4.9, 9.9, 3.0...
## $ Eggs      <dbl> 0.5, 4.3, 4.1, 1.6, 2.8, 3.7, 3.7, 2.7, 3.3, 2.8, 2....
## $ Milk      <dbl> 8.9, 19.9, 17.5, 8.3, 12.5, 25.0, 11.1, 33.7, 19.5, ...
## $ Fish      <dbl> 0.2, 2.1, 4.5, 1.2, 2.0, 9.9, 5.4, 5.8, 5.7, 5.9, 0....
## $ Cereals   <dbl> 42.3, 28.0, 26.6, 56.7, 34.3, 21.9, 24.6, 26.3, 28.1...
## $ Starch    <dbl> 0.6, 3.6, 5.7, 1.1, 5.0, 4.8, 6.5, 5.1, 4.8, 2.2, 4....
## $ Nuts      <dbl> 5.5, 1.3, 2.1, 3.7, 1.1, 0.7, 0.8, 1.0, 2.4, 7.8, 5....
## $ Fr.Veg    <dbl> 1.7, 4.3, 4.0, 4.2, 4.0, 2.4, 3.6, 1.4, 6.5, 6.5, 4....

head(protein)

##          Country RedMeat WhiteMeat Eggs Milk Fish Cereals Starch Nuts
## 1        Albania    10.1       1.4  0.5  8.9  0.2    42.3    0.6  5.5
## 2        Austria     8.9      14.0  4.3 19.9  2.1    28.0    3.6  1.3
## 3        Belgium    13.5       9.3  4.1 17.5  4.5    26.6    5.7  2.1
## 4       Bulgaria     7.8       6.0  1.6  8.3  1.2    56.7    1.1  3.7
## 5 Czechoslovakia     9.7      11.4  2.8 12.5  2.0    34.3    5.0  1.1
## 6        Denmark    10.6      10.8  3.7 25.0  9.9    21.9    4.8  0.7
##   Fr.Veg
## 1    1.7
## 2    4.3
## 3    4.0
## 4    4.2
## 5    4.0
## 6    2.4

rownames(protein)<-protein$Country
df<-protein[-1]


df_d<-dist(scale(df))
set.seed(2)
kmeans(df_d,nstart = 25,centers = 4)->km_p
rownames(df)[km_p$cluster==1]

## [1] "Albania"    "Bulgaria"   "Romania"    "Yugoslavia"

rownames(df)[km_p$cluster==2]

## [1] "Greece"   "Portugal" "Spain"

rownames(df)[km_p$cluster==3]

## [1] "Czechoslovakia" "Hungary"        "Italy"          "Poland"        
## [5] "USSR"

rownames(df)[km_p$cluster==4]

##  [1] "Austria"     "Belgium"     "Denmark"     "E Germany"   "Finland"    
##  [6] "France"      "Ireland"     "Netherlands" "Norway"      "Sweden"     
## [11] "Switzerland" "UK"          "W Germany"

cluster=km_p$cluster
head(cbind(df,cluster))

##                RedMeat WhiteMeat Eggs Milk Fish Cereals Starch Nuts Fr.Veg
## Albania           10.1       1.4  0.5  8.9  0.2    42.3    0.6  5.5    1.7
## Austria            8.9      14.0  4.3 19.9  2.1    28.0    3.6  1.3    4.3
## Belgium           13.5       9.3  4.1 17.5  4.5    26.6    5.7  2.1    4.0
## Bulgaria           7.8       6.0  1.6  8.3  1.2    56.7    1.1  3.7    4.2
## Czechoslovakia     9.7      11.4  2.8 12.5  2.0    34.3    5.0  1.1    4.0
## Denmark           10.6      10.8  3.7 25.0  9.9    21.9    4.8  0.7    2.4
##                cluster
## Albania              1
## Austria              4
## Belgium              4
## Bulgaria             1
## Czechoslovakia       3
## Denmark              4

#Consumo de proteinas  promedio en cada cluster
aggregate(df, by=list(cluster=km_p$cluster), mean)

##   cluster   RedMeat WhiteMeat     Eggs     Milk     Fish  Cereals   Starch
## 1       1  7.125000  4.675000 1.200000  9.45000 0.750000 51.12500 1.950000
## 2       2  7.833333  3.366667 2.333333 10.36667 9.033333 32.63333 4.600000
## 3       3  8.040000  8.740000 2.680000 14.36000 2.340000 38.18000 4.680000
## 4       4 11.807692  9.607692 3.707692 22.08462 5.023077 24.06923 4.761538
##       Nuts Fr.Veg
## 1 5.050000  2.975
## 2 6.133333  7.200
## 3 3.240000  4.880
## 4 1.692308  3.500

#Gráfico con componentes principales
fviz_cluster(km_p,data = df,      palette=c("orange","yellow","salmon","steelblue"),
             star.plot = TRUE,ellipse.type = "eclid",
             repel=T,
             ggtheme = theme_minimal())

#El número de cluster

fviz_nbclust(df, kmeans, method = "wss")+ geom_vline(xintercept = 4, linetype = 2)

Ejemplos de análisis de Cluster

Luis Bolaños

4/4/2020

Ejemplo 1(aglomerativos)

Ejemplo 2(cluster divisitivo)

Ejemplo 3(utilizando factoextra,aglomerativo)

Ejemplo 4 (partitivo kmeans)