En este ejemplo utilizamos el conjunto de datos metcars con un cluster aglomerativo
head(mtcars,2) # examinamos el comjunto de datos
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21 6 160 110 3.9 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21 6 160 110 3.9 2.875 17.02 0 1 4 4
data <-scale(mtcars)# escalamos el conjunto
data_d <- dist(data,method = "euclidean")
# cluster aglomerativo: método mínimo
clust1 <-hclust(data_d,method = "single")
plot(clust1,hang=-0.01,cex=0.6)
#método 2
clust2 <-hclust(data_d,method = "complete")
plot(clust2,hang=-0.01,cex=0.6)
#método 3
clust3 <-hclust(data_d,method ="ward.D2")
plot(clust3,hang=-0.01,cex=0.6)
rect.hclust(clust3,k=4,border = "orange")
clust3
##
## Call:
## hclust(d = data_d, method = "ward.D2")
##
## Cluster method : ward.D2
## Distance : euclidean
## Number of objects: 32
library(cluster)
hc<-diana(scale(mtcars),metric = "euclidian")
par(mfrow=c(1,2))
plot(hc)
data_d<-dist(scale(USArrests)) # matriz de distancias
as.matrix(data_d)[1:6, 1:6] # muestra de la matriz
## Alabama Alaska Arizona Arkansas California Colorado
## Alabama 0.000000 2.703754 2.293520 1.289810 3.263110 2.651067
## Alaska 2.703754 0.000000 2.700643 2.826039 3.012541 2.326519
## Arizona 2.293520 2.700643 0.000000 2.717758 1.310484 1.365031
## Arkansas 1.289810 2.826039 2.717758 0.000000 3.763641 2.831051
## California 3.263110 3.012541 1.310484 3.763641 0.000000 1.287619
## Colorado 2.651067 2.326519 1.365031 2.831051 1.287619 0.000000
hc<- hclust(data_d, method = "ward.D2")
# Dendogramas
fviz_dend(hc, cex = 0.5)
fviz_dend(hc, cex = 0.5,horiz = T)
#cortando el arbol
cutree(hc,k=4)->ajuste
table(ajuste)
## ajuste
## 1 2 3 4
## 7 12 19 12
rownames(USArrests)[ajuste==1]
## [1] "Alabama" "Georgia" "Louisiana" "Mississippi"
## [5] "North Carolina" "South Carolina" "Tennessee"
rownames(USArrests)[ajuste==2]
## [1] "Alaska" "Arizona" "California" "Colorado" "Florida"
## [6] "Illinois" "Maryland" "Michigan" "Nevada" "New Mexico"
## [11] "New York" "Texas"
rownames(USArrests)[ajuste==3]
## [1] "Arkansas" "Connecticut" "Delaware" "Hawaii"
## [5] "Indiana" "Kansas" "Kentucky" "Massachusetts"
## [9] "Missouri" "New Jersey" "Ohio" "Oklahoma"
## [13] "Oregon" "Pennsylvania" "Rhode Island" "Utah"
## [17] "Virginia" "Washington" "Wyoming"
rownames(USArrests)[ajuste==4]
## [1] "Idaho" "Iowa" "Maine" "Minnesota"
## [5] "Montana" "Nebraska" "New Hampshire" "North Dakota"
## [9] "South Dakota" "Vermont" "West Virginia" "Wisconsin"
fviz_dend(hc, k=4,
cex= 0.5,
k_colors = c("yellow", "red", "orange", "blue"),
color_labels_by_k = TRUE,
rect = TRUE,
rect_fill = T)
#gráfico utilizando componentes principales
fviz_cluster(list(data = USArrests, cluster = ajuste),palette = c("yellow", "steelblue", "salmon", "orange"),
ellipse.type = "convex",
show.clust.cent = FALSE,
ggtheme = theme_minimal())
protein<-read.csv("protein.csv")
glimpse(protein)
## Observations: 25
## Variables: 10
## $ Country <fct> Albania, Austria, Belgium, Bulgaria, Czechoslovakia,...
## $ RedMeat <dbl> 10.1, 8.9, 13.5, 7.8, 9.7, 10.6, 8.4, 9.5, 18.0, 10....
## $ WhiteMeat <dbl> 1.4, 14.0, 9.3, 6.0, 11.4, 10.8, 11.6, 4.9, 9.9, 3.0...
## $ Eggs <dbl> 0.5, 4.3, 4.1, 1.6, 2.8, 3.7, 3.7, 2.7, 3.3, 2.8, 2....
## $ Milk <dbl> 8.9, 19.9, 17.5, 8.3, 12.5, 25.0, 11.1, 33.7, 19.5, ...
## $ Fish <dbl> 0.2, 2.1, 4.5, 1.2, 2.0, 9.9, 5.4, 5.8, 5.7, 5.9, 0....
## $ Cereals <dbl> 42.3, 28.0, 26.6, 56.7, 34.3, 21.9, 24.6, 26.3, 28.1...
## $ Starch <dbl> 0.6, 3.6, 5.7, 1.1, 5.0, 4.8, 6.5, 5.1, 4.8, 2.2, 4....
## $ Nuts <dbl> 5.5, 1.3, 2.1, 3.7, 1.1, 0.7, 0.8, 1.0, 2.4, 7.8, 5....
## $ Fr.Veg <dbl> 1.7, 4.3, 4.0, 4.2, 4.0, 2.4, 3.6, 1.4, 6.5, 6.5, 4....
head(protein)
## Country RedMeat WhiteMeat Eggs Milk Fish Cereals Starch Nuts
## 1 Albania 10.1 1.4 0.5 8.9 0.2 42.3 0.6 5.5
## 2 Austria 8.9 14.0 4.3 19.9 2.1 28.0 3.6 1.3
## 3 Belgium 13.5 9.3 4.1 17.5 4.5 26.6 5.7 2.1
## 4 Bulgaria 7.8 6.0 1.6 8.3 1.2 56.7 1.1 3.7
## 5 Czechoslovakia 9.7 11.4 2.8 12.5 2.0 34.3 5.0 1.1
## 6 Denmark 10.6 10.8 3.7 25.0 9.9 21.9 4.8 0.7
## Fr.Veg
## 1 1.7
## 2 4.3
## 3 4.0
## 4 4.2
## 5 4.0
## 6 2.4
rownames(protein)<-protein$Country
df<-protein[-1]
df_d<-dist(scale(df))
set.seed(2)
kmeans(df_d,nstart = 25,centers = 4)->km_p
rownames(df)[km_p$cluster==1]
## [1] "Albania" "Bulgaria" "Romania" "Yugoslavia"
rownames(df)[km_p$cluster==2]
## [1] "Greece" "Portugal" "Spain"
rownames(df)[km_p$cluster==3]
## [1] "Czechoslovakia" "Hungary" "Italy" "Poland"
## [5] "USSR"
rownames(df)[km_p$cluster==4]
## [1] "Austria" "Belgium" "Denmark" "E Germany" "Finland"
## [6] "France" "Ireland" "Netherlands" "Norway" "Sweden"
## [11] "Switzerland" "UK" "W Germany"
cluster=km_p$cluster
head(cbind(df,cluster))
## RedMeat WhiteMeat Eggs Milk Fish Cereals Starch Nuts Fr.Veg
## Albania 10.1 1.4 0.5 8.9 0.2 42.3 0.6 5.5 1.7
## Austria 8.9 14.0 4.3 19.9 2.1 28.0 3.6 1.3 4.3
## Belgium 13.5 9.3 4.1 17.5 4.5 26.6 5.7 2.1 4.0
## Bulgaria 7.8 6.0 1.6 8.3 1.2 56.7 1.1 3.7 4.2
## Czechoslovakia 9.7 11.4 2.8 12.5 2.0 34.3 5.0 1.1 4.0
## Denmark 10.6 10.8 3.7 25.0 9.9 21.9 4.8 0.7 2.4
## cluster
## Albania 1
## Austria 4
## Belgium 4
## Bulgaria 1
## Czechoslovakia 3
## Denmark 4
#Consumo de proteinas promedio en cada cluster
aggregate(df, by=list(cluster=km_p$cluster), mean)
## cluster RedMeat WhiteMeat Eggs Milk Fish Cereals Starch
## 1 1 7.125000 4.675000 1.200000 9.45000 0.750000 51.12500 1.950000
## 2 2 7.833333 3.366667 2.333333 10.36667 9.033333 32.63333 4.600000
## 3 3 8.040000 8.740000 2.680000 14.36000 2.340000 38.18000 4.680000
## 4 4 11.807692 9.607692 3.707692 22.08462 5.023077 24.06923 4.761538
## Nuts Fr.Veg
## 1 5.050000 2.975
## 2 6.133333 7.200
## 3 3.240000 4.880
## 4 1.692308 3.500
#Gráfico con componentes principales
fviz_cluster(km_p,data = df, palette=c("orange","yellow","salmon","steelblue"),
star.plot = TRUE,ellipse.type = "eclid",
repel=T,
ggtheme = theme_minimal())
#El número de cluster
fviz_nbclust(df, kmeans, method = "wss")+ geom_vline(xintercept = 4, linetype = 2)