This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
summary(cars)
## speed dist
## Min. : 4.0 Min. : 2.00
## 1st Qu.:12.0 1st Qu.: 26.00
## Median :15.0 Median : 36.00
## Mean :15.4 Mean : 42.98
## 3rd Qu.:19.0 3rd Qu.: 56.00
## Max. :25.0 Max. :120.00
You can also embed plots, for example:
Note that the echo = FALSE parameter was added to the
code chunk to prevent printing of the R code that generated the
plot.
library(readxl)
library(factoextra)
## Loading required package: ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(cluster)
df1 = read_excel("C:/Users/fmore/Downloads/chaa.xlsx")
head(df1)
## # A tibble: 6 x 4
## humedad almidon proteina harina
## <dbl> <dbl> <dbl> <chr>
## 1 7.72 45.9 10.1 cha
## 2 7.65 46.0 10.6 cha
## 3 9.12 49.3 11.9 cha
## 4 10.3 49.3 10.9 cha
## 5 9.73 53.7 11.8 cha
## 6 8.03 44.9 10.6 cha
dist_mat = dist(df1[, -4], method = 'euclidean')
dim(as.matrix(dist_mat))
## [1] 90 90
heatmap(as.matrix(dist_mat))
library(corrplot)
## corrplot 0.92 loaded
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
R = cor(df1[, -4])
R
## humedad almidon proteina
## humedad 1.0000000 0.8697394 -0.6366744
## almidon 0.8697394 1.0000000 -0.7928747
## proteina -0.6366744 -0.7928747 1.0000000
heatmap(R)
corrplot:: corrplot(R, method = 'number')
corrplot.mixed(R)
corrplot(R,method = "pie")
#Pie es solo una manera de mostrar de una manera distinta los resultados
library(PerformanceAnalytics)
## Loading required package: xts
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
## Attaching package: 'xts'
## The following objects are masked from 'package:dplyr':
##
## first, last
##
## Attaching package: 'PerformanceAnalytics'
## The following object is masked from 'package:graphics':
##
## legend
chart.Correlation(df1[, -4])
#visualizar correlaciones
#single = vecino más cercano
Hierar_cl =hclust(dist_mat, method ='single')
Hierar_cl
##
## Call:
## hclust(d = dist_mat, method = "single")
##
## Cluster method : single
## Distance : euclidean
## Number of objects: 90
plot(Hierar_cl)
abline(h = 2.5, col ='red')
#Complete = vecino más lejano
Hierar_cl =hclust(dist_mat, method ='complete')
Hierar_cl
##
## Call:
## hclust(d = dist_mat, method = "complete")
##
## Cluster method : complete
## Distance : euclidean
## Number of objects: 90
plot(Hierar_cl)
abline(h = 15, col ='red')
abline(h = 10, col ='green')
Hierar_cl =hclust(dist_mat, method ='ward.D')
Hierar_cl
##
## Call:
## hclust(d = dist_mat, method = "ward.D")
##
## Cluster method : ward.D
## Distance : euclidean
## Number of objects: 90
plot(Hierar_cl)
mod1 = cutree(Hierar_cl, k = 3)
rect.hclust(Hierar_cl, k =3, border = 'green')
#matriz de confusión
table(df1$harina, mod1)
## mod1
## 1 2 3
## ama 0 30 0
## arr 0 1 29
## cha 27 3 0
# se toman los datos correctos de cada variable en el modelo 100*(27+29+30)/90 amaranto=30, arr=29 y cha= 27 %de datos correctos = 95,6%
Hierar_cl =hclust(dist_mat, method ='complete')
Hierar_cl
##
## Call:
## hclust(d = dist_mat, method = "complete")
##
## Cluster method : complete
## Distance : euclidean
## Number of objects: 90
plot(Hierar_cl)
mod1 = cutree(Hierar_cl, k = 3)
rect.hclust(Hierar_cl, k =3, border = 'green')
#matriz de confusión
table(df1$harina, mod1)
## mod1
## 1 2 3
## ama 0 30 0
## arr 0 1 29
## cha 27 3 0
# se toman los datos correctos de cada variable en el modelo 100*(27+29+30)/90 amaranto=30, arr=29 y cha= 27 %de datos correctos = 95,6%
Hierar_cl =hclust(dist_mat, method ='single')
Hierar_cl
##
## Call:
## hclust(d = dist_mat, method = "single")
##
## Cluster method : single
## Distance : euclidean
## Number of objects: 90
plot(Hierar_cl)
mod1 = cutree(Hierar_cl, k = 3)
rect.hclust(Hierar_cl, k =3, border = 'green')
#matriz de confusión
table(df1$harina, mod1)
## mod1
## 1 2 3
## ama 0 0 30
## arr 0 0 30
## cha 27 3 0
# se toman los datos correctos de cada variable en el modelo 100*(27+29+30)/90 amaranto=30, arr=30 y cha= 27 %de datos correctos = 96,7%
Hierar_cl =hclust(dist_mat, method ='ward.D')
Hierar_cl
##
## Call:
## hclust(d = dist_mat, method = "ward.D")
##
## Cluster method : ward.D
## Distance : euclidean
## Number of objects: 90
plot(Hierar_cl)
mod1 = cutree(Hierar_cl, k = 3)
rect.hclust(Hierar_cl, k =3, border = 'green')
#matriz de confusión
table(df1$harina, mod1)
## mod1
## 1 2 3
## ama 0 30 0
## arr 0 1 29
## cha 27 3 0
# se toman los datos correctos de cada variable en el modelo 100*(27+29+30)/90 amaranto=30, arr=29 y cha= 27 %de datos correctos = 95,6%
library(cvms)
library(tibble)
df_cm = tibble(prediction = factor(mod1, c(2,1,3), c('ama', 'arr', 'cha'),ordered = T), target = factor(df1$harina, ordered = T))
cm = as_tibble(table(df_cm))
plot_confusion_matrix(cm, target_col = 'target', prediction_col = 'prediction', counts_col = 'n')
## Warning in plot_confusion_matrix(cm, target_col = "target", prediction_col =
## "prediction", : 'ggimage' is missing. Will not plot arrows and zero-shading.
## Warning in plot_confusion_matrix(cm, target_col = "target", prediction_col =
## "prediction", : 'rsvg' is missing. Will not plot arrows and zero-shading.
#Puede llegar a ser muy util para identificar la eficacia de los 4 métodos
library(cluster)
dfb = scale(df1[, 1:3])
m = c('average', 'single', 'complete', 'ward')
names(m) = c('average', 'single', 'complete', 'ward')
ac = function(x) {agnes (dfb, method = x)$ac}
sapply(m,ac)
## average single complete ward
## 0.9183692 0.8615611 0.9460516 0.9868489
library(factoextra)
gap_stat <- clusGap(dfb, FUN = hcut,nstart = 25,K.max = 10, B = 50)
fviz_gap_stat(gap_stat)
library(dplyr)
df1$clust= mod1
df1 %>%
group_by(clust)%>%
summarise(mh<-mean(humedad),
ma<-mean(almidon),
mp<-mean(proteina))
## # A tibble: 3 x 4
## clust `mh <- mean(humedad)` `ma <- mean(almidon)` `mp <- mean(proteina)`
## <int> <dbl> <dbl> <dbl>
## 1 1 7.97 45.5 10.6
## 2 2 10.6 57.7 8.66
## 3 3 11.2 68.7 7.48
y = cbind (df1$humedad, df1$almidon, df1$proteina); y
## [,1] [,2] [,3]
## [1,] 7.721578 45.90128 10.139913
## [2,] 7.652724 46.04431 10.598239
## [3,] 9.123838 49.27112 11.897356
## [4,] 10.268620 49.29338 10.925669
## [5,] 9.725898 53.65593 11.762685
## [6,] 8.032232 44.87319 10.588989
## [7,] 7.668702 44.41084 10.269907
## [8,] 8.297506 47.25879 11.049940
## [9,] 7.079020 42.37882 10.128377
## [10,] 8.372860 46.35557 10.890065
## [11,] 9.114401 48.73582 11.170282
## [12,] 7.624699 44.52850 10.061102
## [13,] 7.320832 43.71180 10.838241
## [14,] 7.918740 47.71021 10.994711
## [15,] 7.511419 47.13314 10.973178
## [16,] 7.432836 42.15872 10.452795
## [17,] 7.048295 42.45157 10.653445
## [18,] 9.495883 54.26860 12.213783
## [19,] 9.566397 52.33493 11.850099
## [20,] 8.428448 46.49959 10.900030
## [21,] 7.137224 41.30813 9.902352
## [22,] 7.996107 43.88332 10.601019
## [23,] 7.857408 46.03987 10.083055
## [24,] 8.475420 46.82790 10.963417
## [25,] 7.770122 44.57957 10.869224
## [26,] 8.337881 46.23931 10.272636
## [27,] 7.322514 43.44046 10.863603
## [28,] 7.120598 44.54915 10.050192
## [29,] 8.086297 47.17898 10.392180
## [30,] 8.476982 46.36449 10.825985
## [31,] 11.406200 61.52768 8.854925
## [32,] 11.865100 62.42881 9.582678
## [33,] 10.994501 56.17152 8.378672
## [34,] 8.547076 54.75710 7.944448
## [35,] 10.139319 58.35008 8.047774
## [36,] 11.251462 59.55176 8.516003
## [37,] 10.721676 56.58344 8.620696
## [38,] 11.169609 55.60496 8.312420
## [39,] 11.709281 58.70984 8.901657
## [40,] 10.631209 59.47148 8.164276
## [41,] 11.837275 61.97836 8.696064
## [42,] 10.899388 60.19528 8.676086
## [43,] 11.034178 59.11214 8.234401
## [44,] 10.364361 56.64930 8.074072
## [45,] 9.453628 55.83020 8.448407
## [46,] 11.304953 59.98648 8.208168
## [47,] 11.817252 58.19939 8.569444
## [48,] 8.854873 53.88221 7.792924
## [49,] 11.705227 56.49285 9.178956
## [50,] 9.864662 57.04852 7.720985
## [51,] 10.976432 58.43758 8.014868
## [52,] 9.758869 53.97065 7.716782
## [53,] 11.104813 54.70654 8.511586
## [54,] 10.752708 62.26898 8.319577
## [55,] 10.770620 56.78398 8.050882
## [56,] 10.874112 59.70302 9.000629
## [57,] 10.591282 60.77411 9.171865
## [58,] 11.443537 60.78443 9.100978
## [59,] 10.878284 57.91546 8.019242
## [60,] 7.694603 52.56606 7.072489
## [61,] 11.639415 69.06842 7.613599
## [62,] 10.560772 66.15191 7.179469
## [63,] 12.900343 70.87005 8.780711
## [64,] 11.536620 73.11278 8.425234
## [65,] 10.154163 61.85292 6.785652
## [66,] 11.339808 68.91843 7.454255
## [67,] 10.168894 66.09209 6.453378
## [68,] 12.010602 70.16086 8.223884
## [69,] 11.194967 72.04267 7.661631
## [70,] 10.411941 66.11417 6.885927
## [71,] 10.869293 65.10160 7.136257
## [72,] 10.965178 71.73988 7.507435
## [73,] 10.592406 66.33288 7.683299
## [74,] 10.677790 66.10951 6.824915
## [75,] 11.148594 70.48496 7.224137
## [76,] 11.253639 66.65250 7.103698
## [77,] 12.338184 71.15704 8.292266
## [78,] 11.690715 69.72544 7.711396
## [79,] 9.977745 68.58767 7.420820
## [80,] 10.762089 66.60586 7.493518
## [81,] 11.902305 70.32080 7.424477
## [82,] 11.073847 66.09679 6.724237
## [83,] 10.378991 69.68403 7.365961
## [84,] 11.920298 67.07839 7.663896
## [85,] 10.146433 71.52956 7.488857
## [86,] 12.372345 71.70629 7.511267
## [87,] 10.926828 65.26694 6.679849
## [88,] 11.028630 65.52440 7.168652
## [89,] 11.240799 68.61351 7.801128
## [90,] 12.772005 71.14063 7.872976
mod2= manova(y~df1$harina)
summary(mod2)
## Df Pillai approx F num Df den Df Pr(>F)
## df1$harina 2 1.4658 78.65 6 172 < 2.2e-16 ***
## Residuals 87
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
library(rgl)
cols = ifelse(df1$harina == "cha", 1,
ifelse(df1$harina == "amma", 2, 3))
plot(df1$humedad, df1$almidon, col = cols)
plot3d(df1$proteina, col=cols, size = 5)
set.seed(1)
km = kmeans(df1[,-4], centers = 3, nstart = 25)
km
## K-means clustering with 3 clusters of sizes 29, 34, 27
##
## Cluster means:
## humedad almidon proteina clust
## 1 11.23453 68.68931 7.475073 3
## 2 10.56938 57.72219 8.662182 2
## 3 7.97027 45.52325 10.642811 1
##
## Clustering vector:
## [1] 3 3 3 3 2 3 3 3 3 3 3 3 3 3 3 3 3 2 2 3 3 3 3 3 3 3 3 3 3 3 2 2 2 2 2 2 2 2
## [39] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1
## [77] 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##
## Within cluster sum of squares by cluster:
## [1] 193.4449 362.0906 139.2457
## (between_SS / total_SS = 91.9 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
fviz_cluster(km, data = df1 [,-4])
aggregate(df1 [,-4], by=list(cluster=km$cluster), mean)
## cluster humedad almidon proteina clust
## 1 1 11.23453 68.68931 7.475073 3
## 2 2 10.56938 57.72219 8.662182 2
## 3 3 7.97027 45.52325 10.642811 1