library(readxl)
library(factoextra)
## Loading required package: ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(cluster)
library(cvms)
library(tibble)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(rgl)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(corrplot)
## corrplot 0.92 loaded
library(PerformanceAnalytics)
## Loading required package: xts
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
## Attaching package: 'xts'
## The following objects are masked from 'package:dplyr':
##
## first, last
##
## Attaching package: 'PerformanceAnalytics'
## The following object is masked from 'package:graphics':
##
## legend
Cargando datos
dfa = read_excel("C:/Users/daorjuelah/Downloads/chaa.xlsx")
head(dfa)
## # A tibble: 6 x 4
## humedad almidon proteina harina
## <dbl> <dbl> <dbl> <chr>
## 1 7.72 45.9 10.1 cha
## 2 7.65 46.0 10.6 cha
## 3 9.12 49.3 11.9 cha
## 4 10.3 49.3 10.9 cha
## 5 9.73 53.7 11.8 cha
## 6 8.03 44.9 10.6 cha
CreaciĂ³n de una matriz de distancias
#usamos solo los valores cuantitativos (quitamos la ultima columna que es una etiqueta)
dist_mat <- dist(dfa[, -4], method = 'euclidean')
dim(as.matrix(dist_mat))
## [1] 90 90
Plot del mapa de calor de cluster
heatmap(as.matrix(dist_mat))
Mapa de calor con la matriz de correlaciones
R <- cor(dfa[, -4]); R
## humedad almidon proteina
## humedad 1.0000000 0.8697394 -0.6366744
## almidon 0.8697394 1.0000000 -0.7928747
## proteina -0.6366744 -0.7928747 1.0000000
heatmap(R)
Plot matriz de correlaciones
#Usamos la libreria Corrplot
corrplot(R, method = 'number')
corrplot(R, method = 'ellipse')
corrplot.mixed(R)
patrones de agrupaciĂ³n en graficos de dispersiĂ³n
chart.Correlation(dfa[,-4])
Graficamos los dendogramas, metodo vecino cercano
Hierar_cl <- hclust(dist_mat, method = 'single'); Hierar_cl
##
## Call:
## hclust(d = dist_mat, method = "single")
##
## Cluster method : single
## Distance : euclidean
## Number of objects: 90
plot(Hierar_cl)
abline(h = 12, col = 'red')
#Grupos separados por numero de subgrupos
mod1 <- cutree(Hierar_cl, k = 3)
dfa$grupos=as.factor(mod1)
table(mod1)
## mod1
## 1 2 3
## 27 3 60
#Creamos rectangulos que contengan los cluster a corte 3
rect.hclust(Hierar_cl, k = 3, border = "green")
Matriz de confusion
table(dfa$harina, mod1)
## mod1
## 1 2 3
## ama 0 0 30
## arr 0 0 30
## cha 27 3 0
Porcentaje de aciertos de clasificacion
porc_CorrClass <- ((table(dfa$harina, mod1)[1, 2] + table(dfa$harina, mod1)[2, 3] + table(dfa$harina, mod1)[3, 1])/dim(dfa))*100; porc_CorrClass
## [1] 63.33333 1140.00000
Graficamos los dendogramas, metodo vecino mas lejano
Hierar_cl <- hclust(dist_mat, method = 'complete'); Hierar_cl
##
## Call:
## hclust(d = dist_mat, method = "complete")
##
## Cluster method : complete
## Distance : euclidean
## Number of objects: 90
plot(Hierar_cl)
abline(h = 12, col = 'red')
#Grupos separados por numero de subgrupos
mod1 <- cutree(Hierar_cl, k = 3)
dfa$grupos=as.factor(mod1)
table(mod1)
## mod1
## 1 2 3
## 27 34 29
#Creamos rectangulos que contengan los cluster a corte 3
rect.hclust(Hierar_cl, k = 3, border = "green")
Matriz de confusion
table(dfa$harina, mod1)
## mod1
## 1 2 3
## ama 0 30 0
## arr 0 1 29
## cha 27 3 0
#Segundo metodo de ploteo con libreria tibble
df_cm = tibble(prediction = factor(mod1,
c(2,1,3),
c('ama','arr','cha'),
ordered = TRUE),
target = factor(dfa$harina,
ordered = TRUE)
)
cm = as_tibble(table(df_cm))
plot_confusion_matrix(cm,
target_col = 'target',
prediction_col = 'prediction',
counts_col = 'n')
## Warning in plot_confusion_matrix(cm, target_col = "target", prediction_col =
## "prediction", : 'ggimage' is missing. Will not plot arrows and zero-shading.
## Warning in plot_confusion_matrix(cm, target_col = "target", prediction_col =
## "prediction", : 'rsvg' is missing. Will not plot arrows and zero-shading.
Porcentaje de aciertos de clasificacion
porc_CorrClass <- ((table(dfa$harina, mod1)[1, 2] + table(dfa$harina, mod1)[2, 3] + table(dfa$harina, mod1)[3, 1])/dim(dfa))*100; porc_CorrClass
## [1] 95.55556 1720.00000
Graficamos los dendogramas, suma de cuadrados por las varianzas
Hierar_cl <- hclust(dist_mat, method = 'ward.D'); Hierar_cl
##
## Call:
## hclust(d = dist_mat, method = "ward.D")
##
## Cluster method : ward.D
## Distance : euclidean
## Number of objects: 90
plot(Hierar_cl)
abline(h = 12, col = 'red')
#Grupos separados por numero de subgrupos
mod1 <- cutree(Hierar_cl, k = 3)
dfa$grupos=as.factor(mod1)
table(mod1)
## mod1
## 1 2 3
## 27 34 29
#Creamos rectangulos que contengan los cluster a corte 3
rect.hclust(Hierar_cl, k = 3, border = "green")
Matriz de confusion
table(dfa$harina, mod1)
## mod1
## 1 2 3
## ama 0 30 0
## arr 0 1 29
## cha 27 3 0
Porcentaje de aciertos de clasificacion
porc_CorrClass <- ((table(dfa$harina, mod1)[1, 2] + table(dfa$harina, mod1)[2, 3] + table(dfa$harina, mod1)[3, 1])/dim(dfa))*100; porc_CorrClass
## [1] 95.55556 1720.00000
Estandarlizar la muestra para poder utilizar los metodos a continuacion
dfb=scale(dfa[, 1:3])
m <- c("average", "single", "complete", "ward")
names(m) <- c("average", "single", "complete", "ward")
ac <- function(x){agnes(dfb, method = x)$ac}
sapply(m, ac)
## average single complete ward
## 0.9183692 0.8615611 0.9460516 0.9868489
Calculando gap statistic for each number of clusters
#Determinar el mejor numero de cluster para el conjunto de datos
gap_stat <- clusGap(dfb,
FUN = hcut,
nstart = 25,
K.max = 10,
B = 50)
#Produce plot of clusters vs gap statistic
fviz_gap_stat(gap_stat)
dfa$clust <- mod1
#Separar y visualizar los resultados
b <- dfa %>% group_by(clust) %>% summarise(mean(humedad),mean(almidon),mean(proteina))
#table(b)
Grafica de sedimentacion para indicar el numero de clusters se deberian tomar
fviz_nbclust(dfb, kmeans, method = 'wss')
y = cbind(dfa$humedad, dfa$almidon, dfa$proteina)
mod2 = manova(y~dfa$harina)
summary(mod2)
## Df Pillai approx F num Df den Df Pr(>F)
## dfa$harina 2 1.4658 78.65 6 172 < 2.2e-16 ***
## Residuals 87
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Visualizacion en 3d de los datos
plot3d(dfa$humedad, dfa$almidon, dfa$proteina, col = c('red', 'blue', 'green'))
cols <- ifelse(dfa$harina == 'cha', 1, ifelse(dfa$harina == 'ama', 2, 3))
#plot_ly(dfa[, 1:3], x = ~humedad, y = ~almidon, z = ~proteina, colors = cols, size = 5)
plot3d(dfa$humedad, dfa$almidon, dfa$proteina, col = cols, size = 5)
VisualizaciĂ³n en 2d de correlacion
plot(dfa$humedad~dfa$almidon)
Creamos un cluster de los datos, para 3 centros
set.seed(1990)
#Perform k-means clustering with k=4 clusters
km <- kmeans(dfb, centers = 3, nstart = 25)
fviz_cluster(km, data = dfb)
aggregate(dfb, by=list(cluster=km$cluster), mean)
## cluster humedad almidon proteina
## 1 1 -1.1650820 -1.18443389 1.2604144
## 2 2 0.4212798 0.04391756 -0.3166988
## 3 3 0.7438022 1.14051632 -0.9437156