EjercicioClase - Análisis de Clusters

0. Instalación de librerias.

library(readxl)
library(factoextra)

## Loading required package: ggplot2

## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

library(cluster)
library(cvms)
library(tibble)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(rgl)
library(plotly)

## 
## Attaching package: 'plotly'

## The following object is masked from 'package:ggplot2':
## 
##     last_plot

## The following object is masked from 'package:stats':
## 
##     filter

## The following object is masked from 'package:graphics':
## 
##     layout

library(corrplot)

## corrplot 0.92 loaded

library(PerformanceAnalytics)

## Loading required package: xts

## Loading required package: zoo

## 
## Attaching package: 'zoo'

## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

## 
## Attaching package: 'xts'

## The following objects are masked from 'package:dplyr':
## 
##     first, last

## 
## Attaching package: 'PerformanceAnalytics'

## The following object is masked from 'package:graphics':
## 
##     legend

1. Entrada y visualización inicial de datos.

Cargando datos

dfa = read_excel("C:/Users/daorjuelah/Downloads/chaa.xlsx")
head(dfa)

## # A tibble: 6 x 4
##   humedad almidon proteina harina
##     <dbl>   <dbl>    <dbl> <chr> 
## 1    7.72    45.9     10.1 cha   
## 2    7.65    46.0     10.6 cha   
## 3    9.12    49.3     11.9 cha   
## 4   10.3     49.3     10.9 cha   
## 5    9.73    53.7     11.8 cha   
## 6    8.03    44.9     10.6 cha

Creación de una matriz de distancias

#usamos solo los valores cuantitativos (quitamos la ultima columna que es una etiqueta)
dist_mat <- dist(dfa[, -4], method = 'euclidean')

dim(as.matrix(dist_mat))

## [1] 90 90

Plot del mapa de calor de cluster

heatmap(as.matrix(dist_mat))

Mapa de calor con la matriz de correlaciones

R <- cor(dfa[, -4]); R

##             humedad    almidon   proteina
## humedad   1.0000000  0.8697394 -0.6366744
## almidon   0.8697394  1.0000000 -0.7928747
## proteina -0.6366744 -0.7928747  1.0000000

heatmap(R)

Plot matriz de correlaciones

#Usamos la libreria Corrplot
corrplot(R, method = 'number')

corrplot(R, method = 'ellipse')

corrplot.mixed(R)

patrones de agrupación en graficos de dispersión

chart.Correlation(dfa[,-4])

2. Método vecino mas cercano

Graficamos los dendogramas, metodo vecino cercano

Hierar_cl <- hclust(dist_mat, method = 'single'); Hierar_cl

## 
## Call:
## hclust(d = dist_mat, method = "single")
## 
## Cluster method   : single 
## Distance         : euclidean 
## Number of objects: 90

plot(Hierar_cl)
abline(h = 12, col = 'red')

#Grupos separados por numero de subgrupos
mod1 <- cutree(Hierar_cl, k = 3)
dfa$grupos=as.factor(mod1)
table(mod1)

## mod1
##  1  2  3 
## 27  3 60

#Creamos rectangulos que contengan los cluster a corte 3
rect.hclust(Hierar_cl, k = 3, border = "green")

Matriz de confusion

table(dfa$harina, mod1)

##      mod1
##        1  2  3
##   ama  0  0 30
##   arr  0  0 30
##   cha 27  3  0

Porcentaje de aciertos de clasificacion

porc_CorrClass <- ((table(dfa$harina, mod1)[1, 2] + table(dfa$harina, mod1)[2, 3] + table(dfa$harina, mod1)[3, 1])/dim(dfa))*100; porc_CorrClass

## [1]   63.33333 1140.00000

3. Método vecino mas lejano

Graficamos los dendogramas, metodo vecino mas lejano

Hierar_cl <- hclust(dist_mat, method = 'complete'); Hierar_cl

## 
## Call:
## hclust(d = dist_mat, method = "complete")
## 
## Cluster method   : complete 
## Distance         : euclidean 
## Number of objects: 90

plot(Hierar_cl)
abline(h = 12, col = 'red')

#Grupos separados por numero de subgrupos
mod1 <- cutree(Hierar_cl, k = 3)
dfa$grupos=as.factor(mod1)
table(mod1)

## mod1
##  1  2  3 
## 27 34 29

#Creamos rectangulos que contengan los cluster a corte 3
rect.hclust(Hierar_cl, k = 3, border = "green")

Matriz de confusion

table(dfa$harina, mod1)

##      mod1
##        1  2  3
##   ama  0 30  0
##   arr  0  1 29
##   cha 27  3  0

#Segundo metodo de ploteo con libreria tibble
df_cm = tibble(prediction = factor(mod1, 
                                   c(2,1,3), 
                                   c('ama','arr','cha'), 
                                   ordered = TRUE),
               target = factor(dfa$harina, 
                               ordered = TRUE)
               )

cm = as_tibble(table(df_cm))

plot_confusion_matrix(cm, 
                      target_col = 'target', 
                      prediction_col = 'prediction', 
                      counts_col = 'n')

## Warning in plot_confusion_matrix(cm, target_col = "target", prediction_col =
## "prediction", : 'ggimage' is missing. Will not plot arrows and zero-shading.

## Warning in plot_confusion_matrix(cm, target_col = "target", prediction_col =
## "prediction", : 'rsvg' is missing. Will not plot arrows and zero-shading.

Porcentaje de aciertos de clasificacion

porc_CorrClass <- ((table(dfa$harina, mod1)[1, 2] + table(dfa$harina, mod1)[2, 3] + table(dfa$harina, mod1)[3, 1])/dim(dfa))*100; porc_CorrClass

## [1]   95.55556 1720.00000

4. Método suma de cuadrados por las varianzas.

Graficamos los dendogramas, suma de cuadrados por las varianzas

Hierar_cl <- hclust(dist_mat, method = 'ward.D'); Hierar_cl

## 
## Call:
## hclust(d = dist_mat, method = "ward.D")
## 
## Cluster method   : ward.D 
## Distance         : euclidean 
## Number of objects: 90

plot(Hierar_cl)
abline(h = 12, col = 'red')

#Grupos separados por numero de subgrupos
mod1 <- cutree(Hierar_cl, k = 3)
dfa$grupos=as.factor(mod1)
table(mod1)

## mod1
##  1  2  3 
## 27 34 29

#Creamos rectangulos que contengan los cluster a corte 3
rect.hclust(Hierar_cl, k = 3, border = "green")

Matriz de confusion

table(dfa$harina, mod1)

##      mod1
##        1  2  3
##   ama  0 30  0
##   arr  0  1 29
##   cha 27  3  0

Porcentaje de aciertos de clasificacion

porc_CorrClass <- ((table(dfa$harina, mod1)[1, 2] + table(dfa$harina, mod1)[2, 3] + table(dfa$harina, mod1)[3, 1])/dim(dfa))*100; porc_CorrClass

## [1]   95.55556 1720.00000

4. Método de cluster para el conjunto de datos

Estandarlizar la muestra para poder utilizar los metodos a continuacion

dfb=scale(dfa[, 1:3])
m <- c("average", "single", "complete", "ward")
names(m) <- c("average", "single", "complete", "ward")
ac <- function(x){agnes(dfb, method = x)$ac}

sapply(m, ac)

##   average    single  complete      ward 
## 0.9183692 0.8615611 0.9460516 0.9868489

Calculando gap statistic for each number of clusters

#Determinar el mejor numero de cluster para el conjunto de datos
gap_stat <- clusGap(dfb, 
                    FUN = hcut,
                    nstart = 25, 
                    K.max = 10,
                    B = 50)

#Produce plot of clusters vs gap statistic
fviz_gap_stat(gap_stat)

dfa$clust <- mod1

#Separar y visualizar los resultados
b <- dfa %>% group_by(clust) %>% summarise(mean(humedad),mean(almidon),mean(proteina))
#table(b)

Grafica de sedimentacion para indicar el numero de clusters se deberian tomar

fviz_nbclust(dfb, kmeans, method = 'wss')

y = cbind(dfa$humedad, dfa$almidon, dfa$proteina)
mod2 = manova(y~dfa$harina)
summary(mod2)

##            Df Pillai approx F num Df den Df    Pr(>F)    
## dfa$harina  2 1.4658    78.65      6    172 < 2.2e-16 ***
## Residuals  87                                            
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Visualizacion en 3d de los datos

plot3d(dfa$humedad, dfa$almidon, dfa$proteina, col = c('red', 'blue', 'green'))

cols <- ifelse(dfa$harina == 'cha', 1, ifelse(dfa$harina == 'ama', 2, 3))
#plot_ly(dfa[, 1:3], x = ~humedad, y = ~almidon, z = ~proteina, colors = cols, size = 5)

plot3d(dfa$humedad, dfa$almidon, dfa$proteina, col = cols, size = 5)

Visualización en 2d de correlacion

plot(dfa$humedad~dfa$almidon)

Creamos un cluster de los datos, para 3 centros

set.seed(1990)

#Perform k-means clustering with k=4 clusters
km <- kmeans(dfb, centers = 3, nstart = 25)

fviz_cluster(km, data = dfb)

aggregate(dfb, by=list(cluster=km$cluster), mean)

##   cluster    humedad     almidon   proteina
## 1       1 -1.1650820 -1.18443389  1.2604144
## 2       2  0.4212798  0.04391756 -0.3166988
## 3       3  0.7438022  1.14051632 -0.9437156

EjercicioClase - Análisis de Clusters

Diego Alejandro Orjuela Hurtado

2022-05-04

0. Instalación de librerias.

1. Entrada y visualización inicial de datos.

2. Método vecino mas cercano

3. Método vecino mas lejano

4. Método suma de cuadrados por las varianzas.

4. Método de cluster para el conjunto de datos