title: "Ejercicios 4 de mayo"
author: "Yeimy Carolina Tirado and Juan Carlos Ramírez"
output: html_document


## HIERARCHICAL CLUSTERING - Import data and load libraries

library(readxl)
library(factoextra)
library(cluster)

dfa = read_excel('C:/Users/CAROLINA/Downloads/chaa (1).xlsx')
head(dfa)
## # A tibble: 6 x 4
##   humedad almidon proteina harina
##     <dbl>   <dbl>    <dbl> <chr> 
## 1    7.72    45.9     10.1 cha   
## 2    7.65    46.0     10.6 cha   
## 3    9.12    49.3     11.9 cha   
## 4   10.3     49.3     10.9 cha   
## 5    9.73    53.7     11.8 cha   
## 6    8.03    44.9     10.6 cha
# Non-standardized data - Euclidean distance method - distance matrix

dist_mat <- dist(dfa[,-4],
                 method ='euclidean')

dim(as.matrix(dist_mat))
## [1] 90 90
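As a sanity check (illustrative only), the Euclidean distance between the first two observations can be recomputed by hand and compared with the corresponding entry of the distance matrix:

# Manual Euclidean distance between observations 1 and 2
x1 <- unlist(dfa[1, 1:3]); x2 <- unlist(dfa[2, 1:3])
sqrt(sum((x1 - x2)^2))
# Same entry read from the distance matrix
as.matrix(dist_mat)[1, 2]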
# Heatmap of the distance matrix; lighter colors indicate smaller distances
heatmap(as.matrix(dist_mat))

# Correlation matrix of the variables
R = cor(dfa[,-4])
R
##             humedad    almidon   proteina
## humedad   1.0000000  0.8697394 -0.6366744
## almidon   0.8697394  1.0000000 -0.7928747
## proteina -0.6366744 -0.7928747  1.0000000
## Heatmap of the correlation matrix
heatmap(R)

# Correlation matrices (corrplot)

library(corrplot)
corrplot::corrplot(R, method="number")

corrplot.mixed(R, order="AOE")

## Correlation matrix (PerformanceAnalytics)

library(PerformanceAnalytics)
chart.Correlation(dfa[,-4]) 

# Clustering methods: single, complete, average.
# single: nearest-neighbor linkage
Hierar_cl1 <- hclust(dist_mat, method = "single")
Hierar_cl1
## 
## Call:
## hclust(d = dist_mat, method = "single")
## 
## Cluster method   : single 
## Distance         : euclidean 
## Number of objects: 90
plot(Hierar_cl1)
abline(h=2.5, col = "red") # Arbitrary cut point to inspect the groups (4)

# complete-linkage clustering: farthest neighbors

Hierar_cl <- hclust(dist_mat, method = "complete")
Hierar_cl
## 
## Call:
## hclust(d = dist_mat, method = "complete")
## 
## Cluster method   : complete 
## Distance         : euclidean 
## Number of objects: 90
# With the complete method, three groups are formed
plot(Hierar_cl)
abline(h=15, col = "red") # 3 groups
abline(h=30, col = "blue") # 2 groups 
abline(h=10, col = "green") # 4 groups

# average-linkage clustering method

Hierar_cl <- hclust(dist_mat, method = "average")
Hierar_cl
## 
## Call:
## hclust(d = dist_mat, method = "average")
## 
## Cluster method   : average 
## Distance         : euclidean 
## Number of objects: 90
plot(Hierar_cl)
abline(h=15, col = "red")

# Ward's method, based on sums of squares

Hierar_cl <- hclust(dist_mat, method = "ward.D")
Hierar_cl
## 
## Call:
## hclust(d = dist_mat, method = "ward.D")
## 
## Cluster method   : ward.D 
## Distance         : euclidean 
## Number of objects: 90
plot(Hierar_cl)
abline(h=15, col = "red")



mod1 <- cutree(Hierar_cl, k=3)
rect.hclust(Hierar_cl, k=3, border="green")

## The grouping was computed for each method; ward.D gave the best result. Some example calculations are shown below.

table(dfa$harina,mod1)
##      mod1
##        1  2  3
##   ama  0 30  0
##   arr  0  1 29
##   cha 27  3  0
# Use the counts on the diagonal of the previous table to compute the classification percentage

100*(27+29+30)/90
## [1] 95.55556
# Model 2: nearest neighbor (single linkage)

plot(Hierar_cl1)
mod2<-cutree(Hierar_cl1,k=3)
# Cut the dendrogram into k groups and enclose them in rectangles

rect.hclust(Hierar_cl1,k=3,border = "green")

# Percentage of correct classifications for the nearest-neighbor method; the same is done for complete linkage and average (see the sketch after the output below).

table(dfa$harina,mod2) # Cross-tabulation of flour type vs. the clusters formed
##      mod2
##        1  2  3
##   ama  0  0 30
##   arr  0  0 30
##   cha 27  3  0
((30+30+27)/90)*100 # note: ama and arr share one cluster here, so this figure overstates the separation
## [1] 96.66667
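Note that under single linkage the ama and arr samples fall into the same cluster, so adding 30 + 30 + 27 credits two flour types to a single cluster and overstates how well the methods separate the flours. A more conservative score sums only the majority class within each cluster; a minimal sketch (the helper name `purity` is ours, not part of the original analysis):

# Score a 3-cluster cut of each linkage method by the sum of per-cluster majority counts
purity <- function(linkage) {
  cl  <- cutree(hclust(dist_mat, method = linkage), k = 3)
  tab <- table(cl, dfa$harina)
  100 * sum(apply(tab, 1, max)) / nrow(dfa)
}
sapply(c("single", "complete", "average", "ward.D"), purity)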
library(caret)
library(cvms)
library(tibble)

# Ward.D method (Model 1)

# Create a new data frame comparing the prediction (prediction) vs. the original classification (target); the cluster-to-label mapping can also be derived automatically, see the sketch further below
df_cm <- tibble(
  prediction = factor(mod1, c(2,3,1), c('ama','arr','cha'), ordered = T),
  target = factor(dfa$harina, ordered = T))
# Confusion matrix
cm <- as_tibble(table(df_cm)); cm
## # A tibble: 9 x 3
##   prediction target     n
##   <chr>      <chr>  <int>
## 1 ama        ama       30
## 2 arr        ama        0
## 3 cha        ama        0
## 4 ama        arr        1
## 5 arr        arr       29
## 6 cha        arr        0
## 7 ama        cha        3
## 8 arr        cha        0
## 9 cha        cha       27
# Plot of the confusion matrix
plot_confusion_matrix(cm,target_col = "target",
                      prediction_col = "prediction",
                      counts_col = "n")

# Nearest-neighbor method (Model 2)

# Create a new data frame comparing the prediction (prediction) vs. the original classification (target)

df_cm2 <- tibble(
  prediction = factor(mod2, c(3, 2, 1), c('ama','arr','cha'), ordered = T),
  target = factor(dfa$harina, ordered = T))
# Note: single linkage puts ama and arr in the same cluster, so this mapping is only approximate
# Confusion matrix
cm2 <- as_tibble(table(df_cm2))
plot_confusion_matrix(cm2,target_col = "target",
                      prediction_col = "prediction",
                      counts_col = "n")

## The variables must be standardized to remove the units (Z-score method)


dfb=scale(dfa[,1:3])
m<-c("average", "single","complete","ward")
names(m) <- c("average", "single","complete","ward")
ac <- function(x){
    agnes(dfb, method=x)$ac
}
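As a quick check (illustrative only), scale() applies the column-wise z-score (x - mean(x)) / sd(x), leaving each variable with mean 0 and standard deviation 1:

# Manual z-score of humedad should match the scaled column
z_manual <- (dfa$humedad - mean(dfa$humedad)) / sd(dfa$humedad)
all.equal(as.numeric(dfb[, "humedad"]), z_manual)
round(colMeans(dfb), 10)  # column means are ~0 after scaling
apply(dfb, 2, sd)         # column standard deviations are 1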
### The best method is the one whose agglomerative coefficient is closest to 1 (in theory Ward is the best; it is based on sums of squares)

sapply(m,ac) # best method: ward
##   average    single  complete      ward 
## 0.9183692 0.8615611 0.9460516 0.9868489
# Apply the agnes function to the non-standardized data to compare the clustering methods
dfb1<-dfa[,1:3]
m<- c("average","single","complete","ward")
ac1 <-function(x){
  agnes(dfb1,method = x)$ac
}
# Comparison of methods with AGNES (the closer to 1, the better the method)

sapply(m,ac1) # The best method is ward
##   average    single  complete      ward 
## 0.9585437 0.8229332 0.9749480 0.9926597
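The agglomerative coefficient can also be read directly from an hclust tree via cluster's coef.hclust accessor; for example, for the ward.D tree fitted earlier on the raw data (the value need not coincide exactly with the agnes figure, which uses its own ward implementation):

# Agglomerative coefficient of the ward.D hclust tree (closer to 1 = stronger structure)
cluster::coef.hclust(Hierar_cl)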
### Determine the optimal number of clusters for the standardized data

gap_stat <- clusGap(dfb, FUN=hcut, nstart=25, K.max = 10, B=50)
## Plot the gap statistic
fviz_gap_stat(gap_stat)
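Rather than reading the plot by eye, the suggested number of clusters can be extracted from the gap table with cluster::maxSE (a sketch, assuming the default "firstSEmax" rule is the one wanted):

# k suggested by the gap statistic under the firstSEmax rule
gap_tab <- as.data.frame(gap_stat$Tab)
maxSE(gap_tab$gap, gap_tab$SE.sim, method = "firstSEmax")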

### Summary of the cluster information

dfa$clust <- mod1
library(dplyr)
b <- dfa %>%
  group_by(clust) %>%
  summarise(humedad = mean(humedad),
            almidon = mean(almidon),
            proteina = mean(proteina))
b



### Compute statistics and do a graphical analysis of the data

# Response matrix (humedad, almidon, proteina) used in the MANOVA below
y = cbind(dfa$humedad, dfa$almidon, dfa$proteina)
## PARTITIONAL CLUSTERING

mod_manova = manova(y ~ dfa$harina)  # MANOVA of the three responses on flour type
summary(mod_manova) # There are differences among the flours
##            Df Pillai approx F num Df den Df    Pr(>F)    
## dfa$harina  2 1.4658    78.65      6    172 < 2.2e-16 ***
## Residuals  87                                            
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
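Since the MANOVA is significant, univariate follow-up ANOVAs (one per response variable) show which of humedad, almidon and proteina differ among the flours. A brief sketch using the standard summary.aov accessor:

# One univariate ANOVA per response column of y
summary.aov(mod_manova)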
## 3D plot to look for patterns

library(rgl)

cols=ifelse(dfa$harina=="cha", 1,
      ifelse(dfa$harina=="ama", 2, 3)
      )

plot3d(dfa$humedad,dfa$proteina,dfa$almidon, col=cols, size = 6)


### Dropping one variable gives a 2D figure
plot(dfa$humedad, dfa$almidon, col=cols)

### Elbow (scree) plot - it can indicate the optimal number of groups

fviz_nbclust(dfb, kmeans, method="wss")
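The elbow criterion can be ambiguous, so the average silhouette width is often checked as a complementary criterion; a sketch using the same factoextra helper:

# Average silhouette width for k = 2..10; the maximum suggests the number of clusters
fviz_nbclust(dfb, kmeans, method = "silhouette")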

### K-means method

km <- kmeans(dfb, centers = 3, nstart = 25)
km
## K-means clustering with 3 clusters of sizes 30, 30, 30
## 
## Cluster means:
##      humedad     almidon   proteina
## 1  0.4212798  0.04391756 -0.3166988
## 2  0.7438022  1.14051632 -0.9437156
## 3 -1.1650820 -1.18443389  1.2604144
## 
## Clustering vector:
##  [1] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 1 1 1 1 1 1 1 1
## [39] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [77] 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## 
## Within cluster sum of squares by cluster:
## [1] 17.16446 12.92903 15.70853
##  (between_SS / total_SS =  82.8 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"
fviz_cluster(km, data=dfb)

### Find the mean of each cluster
aggregate(dfb, by=list(cluster=km$cluster), mean)
##   cluster    humedad     almidon   proteina
## 1       1  0.4212798  0.04391756 -0.3166988
## 2       2  0.7438022  1.14051632 -0.9437156
## 3       3 -1.1650820 -1.18443389  1.2604144
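To relate the k-means partition back to the known flour types, the cluster vector can be cross-tabulated against harina, and the centroids re-expressed in the original units; a short sketch:

# Flour type vs. k-means cluster
table(dfa$harina, km$cluster)
# Cluster means in the original (non-standardized) units
aggregate(dfa[, 1:3], by = list(cluster = km$cluster), mean)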