# Clustering of scientific articles and objectives
## Attached packages: mclust 6.0.0, dendextend 1.17.1, dplyr, plotly (with ggplot2),
## viridis, viridisLite and heatmaply 1.4.2 (startup, citation and masking messages omitted).
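For reference, a minimal sketch of the corresponding library calls; the cluster and fpc packages are assumptions inferred from the clusplot() and clusterboot() calls used further down.
# Packages implied by the startup messages; cluster and fpc are assumed from
# clusplot() and clusterboot() below. Wrapping the calls suppresses the messages.
suppressPackageStartupMessages({
  library(mclust)
  library(dendextend)
  library(dplyr)
  library(heatmaply)   # also attaches plotly, ggplot2, viridis and viridisLite
  library(cluster)     # clusplot()
  library(fpc)         # clusterboot(), hclustCBI
})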
A binary matrix of presence/absence values (1/0) has been prepared, with one row per reviewed article and one column for each of the 11 objectives. Loading the data and counting the rows shows how many articles are included.
mydata <- read.csv("datasetforR5-LCA.csv", sep=";", header = TRUE)
print(count(mydata))
## n
## 1 115
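Before computing binary distances it is worth confirming that every cell really is coded 0/1; a quick sanity check (a sketch, not part of the original workflow):
# Sketch: stop if any cell is not 0 or 1, then report the matrix dimensions
stopifnot(all(as.matrix(mydata) %in% c(0, 1)))
dim(mydata)  # rows = papers, columns = objectives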
par(mfrow=c(1,1)) # Set the plot layout to 1 row, 1 column
#mybinarymap <- heatmap(data.matrix(mydata), Rowv=NA, Colv=NA, col = c("white","black"))
heatmaply(mydata,
          dendrogram = "none",
          Rowv = FALSE,
          Colv = FALSE,
          colors = c("grey", "black"),
          hide_colorbar = TRUE,
          grid_gap = 2,
          cexRow = .5,
          grid_size = 2,
          margins = c(10, 10)) %>% layout(width = 800, height = 500)
## Warning: Specifying width/height in layout() is now deprecated.
## Please specify in ggplotly() or plot_ly()
The distance matrix is calculated from the binary matrix using the 'binary' (Jaccard) method.
d_binary <- dist(mydata, method="binary") # First, construct a distance matrix.
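For two papers, the 'binary' method returns one minus the Jaccard similarity: the share of objectives that appear in exactly one of the two papers, among the objectives that appear in at least one. A tiny illustration with hypothetical rows:
# Hypothetical example: papers A and B share 1 of the 3 objectives that either
# of them addresses, so the binary (Jaccard) distance is 2/3.
x <- rbind(paperA = c(1, 1, 0, 0), paperB = c(1, 0, 1, 0))
dist(x, method = "binary")
##           paperA
## paperB 0.6666667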
Hierarchical clustering is then applied to the distance matrix using complete linkage, which tends to produce compact, fairly balanced clusters.
#method_option = reactive({c(input$fitoption)})
method_option <- "complete"
fit <- hclust(d_binary, method = method_option)
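One way to check that complete linkage is a reasonable choice here is to compare the cophenetic correlation of a few linkage methods against the binary distances (a sketch; the set of candidate linkages is an assumption):
# Higher cophenetic correlation = the dendrogram preserves the original
# pairwise distances more faithfully.
sapply(c("complete", "average", "single", "ward.D2"), function(m)
  cor(d_binary, cophenetic(hclust(d_binary, method = m))))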
Exploring the dendrogram with 8 clusters.
#selectInput("clustoption", label = "Cluster:", choices=c(2,3,4,5,6,7,8,9,10), selected=8)
print(clustoption <- as.numeric("8"))
## [1] 8
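The choice of 8 clusters can be sanity-checked by looking at the average silhouette width over a range of k values on the same binary distances (a sketch, assuming the cluster package is available and that silhouette width is an acceptable criterion for this data):
# Average silhouette width per candidate number of clusters (higher is better)
avg_sil <- sapply(2:10, function(k)
  mean(cluster::silhouette(cutree(fit, k = k), d_binary)[, "sil_width"]))
names(avg_sil) <- 2:10
round(avg_sil, 3)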
Then we can draw the dendrogram with cluster boxes and with colored branches.
# renderPlot({
# par(mfrow=c(1,1)) # Set the plot layout to 1 row, 1 column
plot(fit, cex=0.8) # cex refers to font size in the plot
# rect.hclust(fit, k = as.numeric(input$clustoption)) # k refers to the number of clusters
rect.hclust(fit, k = as.numeric(clustoption)) # k refers to the number of clusters
# })
#renderPlot({
# par(mfrow=c(1,1)) # Set the plot layout to 1 row, 1 column
# plot(color_branches(as.dendrogram(fit), k = as.numeric(input$clustoption)))
plot(color_branches(as.dendrogram(fit), k = clustoption))
# })
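With around 115 leaves the labels can get crowded; dendextend can shrink them before plotting (an optional styling sketch):
# Optional: color the branches and shrink the leaf labels for readability
dend <- as.dendrogram(fit) %>%
  color_branches(k = clustoption) %>%
  set("labels_cex", 0.5)
plot(dend)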
We use summary statistics to explore the cluster characteristics: the number of papers per cluster and the share of papers in each cluster that address each objective.
#renderTable({
segment_objectives <- mutate(mydata, cluster = cutree(fit, k = clustoption))
#counting the clusters
count(segment_objectives, cluster)
#})
#renderTable({
# segment_objectives <- mutate(mydata, cluster = cutree(fit, k = clustoption))
segment_objectives %>%
group_by(cluster) %>%
summarise_all(mean)
#})
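To read the cluster profiles more easily, the per-cluster means (the share of papers in each cluster that address each objective) can be reshaped and ranked; a sketch assuming tidyr and dplyr >= 1.0 are available:
# Sketch: top three most common objectives within each cluster
library(tidyr)
segment_objectives %>%
  group_by(cluster) %>%
  summarise(across(everything(), mean)) %>%
  pivot_longer(-cluster, names_to = "objective", values_to = "share") %>%
  group_by(cluster) %>%
  slice_max(share, n = 3)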
#renderPlot({
#plotting clusters, but papers overlap at the 1 and 0 values
# range_papers <- c(1:as.numeric(count(mydata)))
# segment_objectives <- mutate(mydata, cluster = cutree(fit, k = #as.numeric(input$clustoption)))
# segment_objectives %>%
# mutate(Paper_Id = range_papers) %>%
# gather(key = "objectives",
# value = "presence",
# c(Energy_reduction, Socioeconomic_evaluation, Thermal_comfort, User_satisfaction, #GHG_reduction, Economic_evaluation, Solution_development, Model_calibration, Retrofit_proposal, Retrofit_effectiveness)) %>%
# ggplot(aes(x = factor(objectives), y = presence, color = factor(cluster))) +
# geom_point(aes(group = Paper_Id))
#})
#renderPlot({
groups <- cutree(fit, k = clustoption)
# cluster::clusplot projects the observations onto their first two principal components
clusplot(mydata, groups, color=TRUE, shade = TRUE, labels=5, lines=0)
#})
To evaluate whether the clusters are stable, we calculate the average Jaccard similarity index for each cluster over 1000 bootstrap resamples.
bsamples <- 1000
#renderTable({
set.seed(8675309) # set the seed before resampling so the bootstrap is reproducible
clus.boot <- clusterboot(mydata,
                         B = bsamples,              # number of bootstrap resamples
                         clustermethod = hclustCBI, # fpc interface for hierarchical clustering
                         method = method_option,
                         dmethod = "binary",
                         cmethod = method_option,   # use what we used in "hclust"
                         k = clustoption,
                         count = FALSE)             # do not print progress on screen
AvgJaccard <- clus.boot$bootmean              # clusterwise mean Jaccard similarity across resamples
Instability <- clus.boot$bootbrd/bsamples     # share of resamples in which the cluster dissolved
# Clusters <- c(1:as.numeric(input$clustoption))
Clusters <- c(1:clustoption)
Eval <- cbind(Clusters, AvgJaccard, Instability)
Eval
## Clusters AvgJaccard Instability
## [1,] 1 0.5383904 0.500
## [2,] 2 0.4048511 0.799
## [3,] 3 0.3603934 0.804
## [4,] 4 0.3698985 0.857
## [5,] 5 0.5705541 0.469
## [6,] 6 0.4313406 0.747
## [7,] 7 0.6388380 0.311
## [8,] 8 0.5018190 0.623
#})
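A common reading of these numbers, following the guidance that accompanies fpc's clusterboot() (the exact thresholds are a rule of thumb, not a hard rule): mean Jaccard values below about 0.5 indicate a dissolved cluster, values around 0.6-0.75 suggest the cluster reflects some pattern in the data, and values above roughly 0.75-0.85 indicate a stable cluster. A sketch that labels the table accordingly:
# Rule-of-thumb stability labels (thresholds are an assumption based on common
# guidance around clusterboot, not a formal test)
stability <- cut(AvgJaccard,
                 breaks = c(-Inf, 0.5, 0.6, 0.75, 0.85, Inf),
                 labels = c("dissolved", "doubtful", "some pattern",
                            "stable", "highly stable"))
data.frame(Cluster = Clusters, AvgJaccard = round(AvgJaccard, 3),
           Instability = round(Instability, 3), stability)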