#Clustering of scientific articles and objectives

Setup environment

## Package 'mclust' version 6.0.0
## Type 'citation("mclust")' for citing this R package in publications.
## 
## ---------------------
## Welcome to dendextend version 1.17.1
## Type citation('dendextend') for how to cite the package.
## 
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
## 
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## You may ask questions at stackoverflow, use the r and dendextend tags: 
##   https://stackoverflow.com/questions/tagged/dendextend
## 
##  To suppress this message use:  suppressPackageStartupMessages(library(dendextend))
## ---------------------
## 
## Attaching package: 'dendextend'
## The following object is masked from 'package:stats':
## 
##     cutree
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
## Loading required package: plotly
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
## Loading required package: viridis
## Loading required package: viridisLite
## 
## ======================
## Welcome to heatmaply version 1.4.2
## 
## Type citation('heatmaply') for how to cite the package.
## Type ?heatmaply for the main documentation.
## 
## The github page is: https://github.com/talgalili/heatmaply/
## Please submit your suggestions and bug-reports at: https://github.com/talgalili/heatmaply/issues
## You may ask questions at stackoverflow, use the r and heatmaply tags: 
##   https://stackoverflow.com/questions/tagged/heatmaply
## ======================

Binary Matrix

A Binary Matrix has been prepared with 140 observations and 11 objectives, with presence/absence values 1/0.

  mydata <- read.csv("datasetforR5-LCA.csv", sep=";", header = TRUE)
  print(count(mydata))
##     n
## 1 115

Heatmap

par(mfrow=c(1,1)) # Set the plot to 1 row, 2 columns
#mybinarymap <- heatmap(data.matrix(mydata), Rowv=NA, Colv=NA, col = c("white","black"))

heatmaply(mydata,
          dendogram = "none",
          Rowv = FALSE,
          Colv = FALSE,
          col = c("grey", "black"),
          hide_colorbar = TRUE,
          grid_gap = 2,
          cexRow=.5,
          grid_size = 2, 
          margins = c (10,10)) %>% layout(width=800, height=500)
## Warning: Specifying width/height in layout() is now deprecated.
## Please specify in ggplotly() or plot_ly()

Calculating binary distance

Using the method ‘binary’ the distance of the Matrix will be calculared

  d_binary <- dist(mydata, method="binary") # First, construct a distance matrix.

Hierarchical clustering

With the distance matrix the clustering is done using the complete method for balanced and tighers clusters

  #method_option = reactive({c(input$fitoption)})
  method_option <- "complete"
  fit <- hclust(d_binary, method = method_option)  

Dendogram Plots

Exploring dendograms with 8 clusters

#selectInput("clustoption", label = "Cluster:", choices=c(2,3,4,5,6,7,8,9,10), selected=8)
print(clustoption <- as.numeric("8"))
## [1] 8

Then we can draw the dendograms with boxes and with color branches

Dendogram with boxes

# renderPlot({
#  par(mfrow=c(1,1)) # Set the plot to 1 row, 2 columns
  plot(fit, cex=0.8) # cex refers to font size in the plot
#  rect.hclust(fit, k = as.numeric(input$clustoption)) # k refers to the number of clusters
  rect.hclust(fit, k = as.numeric(clustoption)) # k refers to the number of clusters

#  })

Dendograms with color branches

#renderPlot({
#  par(mfrow=c(1,1)) # Set the plot to 1 row, 2 columns
#  plot(color_branches(as.dendrogram(fit), k = as.numeric(input$clustoption)))
  plot(color_branches(as.dendrogram(fit), k = clustoption))

 # })

Plot objectives in segments

Use summary statistics to explore the cluster characteristics.

#renderTable({
  segment_objectives <- mutate(mydata, cluster = cutree(fit, k = clustoption))
  #counting the clusters
  count(segment_objectives, cluster)
#})

#renderTable({ 
#  segment_objectives <- mutate(mydata, cluster = cutree(fit, k = clustoption))
  segment_objectives %>% 
    group_by(cluster) %>% 
    summarise_all(mean)
#})

#renderPlot({
  #plotting clusters but papers overlaps in the 1 and 0 values
#  range_papers <- c(1:as.numeric(count(mydata)))
#  segment_objectives <- mutate(mydata, cluster = cutree(fit, k = #as.numeric(input$clustoption)))
#  segment_objectives %>%
#    mutate(Paper_Id = range_papers) %>%
#    gather(key = "objectives", 
#           value = "presence", 
#           c(Energy_reduction, Socioeconomic_evaluation, Thermal_comfort, User_satisfaction, #GHG_reduction, Economic_evaluation, Solution_development, Model_calibration,  Retrofit_proposal, Retrofit_effectiveness)) %>%
#    ggplot(aes(x = factor(objectives), y = presence, color = factor(cluster))) + 
#    geom_point(aes(group = Paper_Id))

#})

Plot of cluster clouds

#renderPlot({
  groups <- cutree(fit, k=clustoption)
  clusplot(mydata, groups, color=TRUE, shade = TRUE, labels=5, lines=0)

#})

Evaluation of clusters

To evaluate if the clusters are similar enough we calculate the average of Jaccard similarity index. We boostrap 1000 times.

bsamples <- 1000
#renderTable({
  clus.boot <- clusterboot(mydata, 
                         B=bsamples, # Number of bootstrap resamples
                         clustermethod=hclustCBI, # for hierarchical clustering 
                         method=method_option,
                         dmethod="binary",
                         cmethod=method_option, # use what we used in "hclust"
                         k=clustoption, 
                         count=FALSE) # Show progress on screen?

  set.seed(8675309)

  AvgJaccard <- clus.boot$bootmean
  Instability <- clus.boot$bootbrd/bsamples
#  Clusters <- c(1:as.numeric(input$clustoption))
  Clusters <- c(1:clustoption)
  Eval <- cbind(Clusters, AvgJaccard, Instability)
  Eval
##      Clusters AvgJaccard Instability
## [1,]        1  0.5383904       0.500
## [2,]        2  0.4048511       0.799
## [3,]        3  0.3603934       0.804
## [4,]        4  0.3698985       0.857
## [5,]        5  0.5705541       0.469
## [6,]        6  0.4313406       0.747
## [7,]        7  0.6388380       0.311
## [8,]        8  0.5018190       0.623
#})