0. InstalaciĂ³n de librerias.

library(readxl)
library(factoextra)
## Loading required package: ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(cluster)
library(cvms)
library(tibble)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(rgl)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(corrplot)
## corrplot 0.92 loaded
library(PerformanceAnalytics)
## Loading required package: xts
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## Attaching package: 'xts'
## The following objects are masked from 'package:dplyr':
## 
##     first, last
## 
## Attaching package: 'PerformanceAnalytics'
## The following object is masked from 'package:graphics':
## 
##     legend

1. Entrada y visualizaciĂ³n inicial de datos.

Cargando datos

dfa = read_excel("C:/Users/daorjuelah/Downloads/chaa.xlsx")
head(dfa)
## # A tibble: 6 x 4
##   humedad almidon proteina harina
##     <dbl>   <dbl>    <dbl> <chr> 
## 1    7.72    45.9     10.1 cha   
## 2    7.65    46.0     10.6 cha   
## 3    9.12    49.3     11.9 cha   
## 4   10.3     49.3     10.9 cha   
## 5    9.73    53.7     11.8 cha   
## 6    8.03    44.9     10.6 cha

CreaciĂ³n de una matriz de distancias

#usamos solo los valores cuantitativos (quitamos la ultima columna que es una etiqueta)
dist_mat <- dist(dfa[, -4], method = 'euclidean')

dim(as.matrix(dist_mat))
## [1] 90 90

Plot del mapa de calor de cluster

heatmap(as.matrix(dist_mat))

Mapa de calor con la matriz de correlaciones

R <- cor(dfa[, -4]); R
##             humedad    almidon   proteina
## humedad   1.0000000  0.8697394 -0.6366744
## almidon   0.8697394  1.0000000 -0.7928747
## proteina -0.6366744 -0.7928747  1.0000000
heatmap(R)

Plot matriz de correlaciones

#Usamos la libreria Corrplot
corrplot(R, method = 'number')

corrplot(R, method = 'ellipse')

corrplot.mixed(R)

patrones de agrupaciĂ³n en graficos de dispersiĂ³n

chart.Correlation(dfa[,-4])

2. Método vecino mas cercano

Graficamos los dendogramas, metodo vecino cercano

Hierar_cl <- hclust(dist_mat, method = 'single'); Hierar_cl
## 
## Call:
## hclust(d = dist_mat, method = "single")
## 
## Cluster method   : single 
## Distance         : euclidean 
## Number of objects: 90
plot(Hierar_cl)
abline(h = 12, col = 'red')

#Grupos separados por numero de subgrupos
mod1 <- cutree(Hierar_cl, k = 3)
dfa$grupos=as.factor(mod1)
table(mod1)
## mod1
##  1  2  3 
## 27  3 60
#Creamos rectangulos que contengan los cluster a corte 3
rect.hclust(Hierar_cl, k = 3, border = "green")

Matriz de confusion

table(dfa$harina, mod1)
##      mod1
##        1  2  3
##   ama  0  0 30
##   arr  0  0 30
##   cha 27  3  0

Porcentaje de aciertos de clasificacion

porc_CorrClass <- ((table(dfa$harina, mod1)[1, 2] + table(dfa$harina, mod1)[2, 3] + table(dfa$harina, mod1)[3, 1])/dim(dfa))*100; porc_CorrClass
## [1]   63.33333 1140.00000

3. Método vecino mas lejano

Graficamos los dendogramas, metodo vecino mas lejano

Hierar_cl <- hclust(dist_mat, method = 'complete'); Hierar_cl
## 
## Call:
## hclust(d = dist_mat, method = "complete")
## 
## Cluster method   : complete 
## Distance         : euclidean 
## Number of objects: 90
plot(Hierar_cl)
abline(h = 12, col = 'red')

#Grupos separados por numero de subgrupos
mod1 <- cutree(Hierar_cl, k = 3)
dfa$grupos=as.factor(mod1)
table(mod1)
## mod1
##  1  2  3 
## 27 34 29
#Creamos rectangulos que contengan los cluster a corte 3
rect.hclust(Hierar_cl, k = 3, border = "green")

Matriz de confusion

table(dfa$harina, mod1)
##      mod1
##        1  2  3
##   ama  0 30  0
##   arr  0  1 29
##   cha 27  3  0
#Segundo metodo de ploteo con libreria tibble
df_cm = tibble(prediction = factor(mod1, 
                                   c(2,1,3), 
                                   c('ama','arr','cha'), 
                                   ordered = TRUE),
               target = factor(dfa$harina, 
                               ordered = TRUE)
               )

cm = as_tibble(table(df_cm))

plot_confusion_matrix(cm, 
                      target_col = 'target', 
                      prediction_col = 'prediction', 
                      counts_col = 'n')
## Warning in plot_confusion_matrix(cm, target_col = "target", prediction_col =
## "prediction", : 'ggimage' is missing. Will not plot arrows and zero-shading.
## Warning in plot_confusion_matrix(cm, target_col = "target", prediction_col =
## "prediction", : 'rsvg' is missing. Will not plot arrows and zero-shading.

Porcentaje de aciertos de clasificacion

porc_CorrClass <- ((table(dfa$harina, mod1)[1, 2] + table(dfa$harina, mod1)[2, 3] + table(dfa$harina, mod1)[3, 1])/dim(dfa))*100; porc_CorrClass
## [1]   95.55556 1720.00000

4. Método suma de cuadrados por las varianzas.

Graficamos los dendogramas, suma de cuadrados por las varianzas

Hierar_cl <- hclust(dist_mat, method = 'ward.D'); Hierar_cl
## 
## Call:
## hclust(d = dist_mat, method = "ward.D")
## 
## Cluster method   : ward.D 
## Distance         : euclidean 
## Number of objects: 90
plot(Hierar_cl)
abline(h = 12, col = 'red')

#Grupos separados por numero de subgrupos
mod1 <- cutree(Hierar_cl, k = 3)
dfa$grupos=as.factor(mod1)
table(mod1)
## mod1
##  1  2  3 
## 27 34 29
#Creamos rectangulos que contengan los cluster a corte 3
rect.hclust(Hierar_cl, k = 3, border = "green")

Matriz de confusion

table(dfa$harina, mod1)
##      mod1
##        1  2  3
##   ama  0 30  0
##   arr  0  1 29
##   cha 27  3  0

Porcentaje de aciertos de clasificacion

porc_CorrClass <- ((table(dfa$harina, mod1)[1, 2] + table(dfa$harina, mod1)[2, 3] + table(dfa$harina, mod1)[3, 1])/dim(dfa))*100; porc_CorrClass
## [1]   95.55556 1720.00000

4. Método de cluster para el conjunto de datos

Estandarlizar la muestra para poder utilizar los metodos a continuacion

dfb=scale(dfa[, 1:3])
m <- c("average", "single", "complete", "ward")
names(m) <- c("average", "single", "complete", "ward")
ac <- function(x){agnes(dfb, method = x)$ac}

sapply(m, ac)
##   average    single  complete      ward 
## 0.9183692 0.8615611 0.9460516 0.9868489

Calculando gap statistic for each number of clusters

#Determinar el mejor numero de cluster para el conjunto de datos
gap_stat <- clusGap(dfb, 
                    FUN = hcut,
                    nstart = 25, 
                    K.max = 10,
                    B = 50)

#Produce plot of clusters vs gap statistic
fviz_gap_stat(gap_stat)

dfa$clust <- mod1

#Separar y visualizar los resultados
b <- dfa %>% group_by(clust) %>% summarise(mean(humedad),mean(almidon),mean(proteina))
#table(b)

Grafica de sedimentacion para indicar el numero de clusters se deberian tomar

fviz_nbclust(dfb, kmeans, method = 'wss')

y = cbind(dfa$humedad, dfa$almidon, dfa$proteina)
mod2 = manova(y~dfa$harina)
summary(mod2)
##            Df Pillai approx F num Df den Df    Pr(>F)    
## dfa$harina  2 1.4658    78.65      6    172 < 2.2e-16 ***
## Residuals  87                                            
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Visualizacion en 3d de los datos

plot3d(dfa$humedad, dfa$almidon, dfa$proteina, col = c('red', 'blue', 'green'))

cols <- ifelse(dfa$harina == 'cha', 1, ifelse(dfa$harina == 'ama', 2, 3))
#plot_ly(dfa[, 1:3], x = ~humedad, y = ~almidon, z = ~proteina, colors = cols, size = 5)

plot3d(dfa$humedad, dfa$almidon, dfa$proteina, col = cols, size = 5)

VisualizaciĂ³n en 2d de correlacion

plot(dfa$humedad~dfa$almidon)

Creamos un cluster de los datos, para 3 centros

set.seed(1990)

#Perform k-means clustering with k=4 clusters
km <- kmeans(dfb, centers = 3, nstart = 25)

fviz_cluster(km, data = dfb)

aggregate(dfb, by=list(cluster=km$cluster), mean)
##   cluster    humedad     almidon   proteina
## 1       1 -1.1650820 -1.18443389  1.2604144
## 2       2  0.4212798  0.04391756 -0.3166988
## 3       3  0.7438022  1.14051632 -0.9437156