library(data.table)
library(tm)
## Loading required package: NLP
library(proxy)
## 
## Attaching package: 'proxy'
## The following objects are masked from 'package:stats':
## 
##     as.dist, dist
## The following object is masked from 'package:base':
## 
##     as.matrix
library(RTextTools)
## Loading required package: SparseM
## 
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
## 
##     backsolve
library(fpc)   
library(wordcloud)
## Loading required package: RColorBrewer
library(cluster)
library(stringi)
library(factoextra)
## Loading required package: ggplot2
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate
## Welcome! Related Books: `Practical Guide To Cluster Analysis in R` at https://goo.gl/13EFCZ
library(seriation)
library(clustertend)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
## 
##     between, first, last
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(stringr)
library(splitstackshape)
library(reshape2)
## 
## Attaching package: 'reshape2'
## The following objects are masked from 'package:data.table':
## 
##     dcast, melt
library(readr)
# dist_matrix <- read_csv("~/dist_matrix_revised.csv", col_names = FALSE)
# colnames(dist_matrix)<-c("tuple_a","tuple_b","overlap","max","value")
# dist_matrix
# 
# dist_matrix$v<-1-dist_matrix$value
# table(dist_matrix$v)
# library(data.table)
# master_input_cluster <- dist_matrix[,c("tuple_a","tuple_b","v")]
# 
# master_input_cluster<-data.table(master_input_cluster)
# 
# final<-dcast(master_input_cluster,tuple_a~tuple_b,fun.aggregate = mean,value.var ="v")

# write.csv(final,"casted_output.csv",na = "",row.names = FALSE)
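# a toy illustration of the commented-out reshaping above, with made-up values:
# the long pairwise table (tuple_a, tuple_b, v = 1 - similarity) is cast into a
# wide tuple_a x tuple_b matrix of dissimilarities for dist()/cmdscale() later.
# toy <- data.table(tuple_a = c("a", "a", "b"),
#                   tuple_b = c("b", "c", "c"),
#                   v       = c(0.2, 0.9, 0.5))
# dcast(toy, tuple_a ~ tuple_b, fun.aggregate = mean, value.var = "v")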

## reading the casted output to build the distance matrix ##

final <- read_csv("~/casted_output.csv")
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   tuple_a = col_character(),
##   `tdr1tv-tdr1xv` = col_integer(),
##   `tdr1tv-tdr1yf` = col_integer(),
##   `tdr1tz-tdr1v6` = col_integer(),
##   `tdr1tz-tdr1w5` = col_integer(),
##   `tdr1tz-tdr3b4` = col_integer(),
##   `tdr1v2-tdr1vb` = col_integer(),
##   `tdr1v3-tdr1vb` = col_integer(),
##   `tdr1v3-tdr1w4` = col_integer(),
##   `tdr1v6-tdr1v2` = col_integer(),
##   `tdr1v6-tdr1v6` = col_integer(),
##   `tdr1v6-tdr1xf` = col_integer(),
##   `tdr1v6-tdr1xp` = col_integer(),
##   `tdr1v6-tdr1yc` = col_integer(),
##   `tdr1v6-tdr38q` = col_integer(),
##   `tdr1v7-tdr1v3` = col_integer(),
##   `tdr1v7-tdr1v8` = col_integer(),
##   `tdr1v7-tdr1v9` = col_integer(),
##   `tdr1v7-tdr1ve` = col_integer(),
##   `tdr1v7-tdr1vg` = col_integer()
##   # ... with 1407 more columns
## )
## See spec(...) for full column specifications.
## Warning in rbind(names(probs), probs_f): number of columns of result is not
## a multiple of vector length (arg 1)
## Warning: 17321 parsing failures.
## # A tibble: 5 x 5
##     row col           expected               actual
##   <int> <chr>         <chr>                  <chr>
## 1  1004 tdr1x8-tdr3bj no trailing characters .947368421052632
## 2  1004 tdr1x9-tdr3bh no trailing characters .947368421052632
## 3  1004 tdr1x9-tdr3bj no trailing characters .947368421052632
## 4  1005 tdr1x9-tdr3b5 no trailing characters .75
## 5  1005 tdr3b4-tdr3be no trailing characters .952380952380952
## # ... with 1 more variables: file <chr>
## See problems(...) for more details.
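# the parsing failures above come from readr guessing integer types for some
# columns from the first rows and then hitting decimals further down, which get
# dropped as NA; a hedged alternative (same path as the original call) is to
# force every similarity column to double and only tuple_a to character.
# final <- read_csv("~/casted_output.csv",
#                   col_types = cols(.default = col_double(),
#                                    tuple_a  = col_character()))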
final <- as.data.frame(final)   # drop the tibble down to a plain data.frame before matrix conversion

## checking if data can be clustered ##
library(factoextra)
library(seriation)
library(clustertend)

# helper: use the first column (tuple_a) as row names and return the remaining
# columns as a numeric matrix
matrix.please <- function(x) {
  m <- as.matrix(x[, -1])
  rownames(m) <- x[, 1]
  m
}

df_scaled <- matrix.please(final)   # note: no scaling applied; the casted matrix with tuples as row names
distMatrix <- dist(df_scaled)       # Euclidean distances between tuple rows
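# the "checking if data can be clustered" step sketched out (illustration only,
# not part of the original run), using clustertend and seriation loaded above:
# hopkins() values well below 0.5 suggest cluster structure, and dissplot()
# draws an ordered dissimilarity image in which block patterns indicate
# clusters. Assumes the casted matrix has no missing values (na.omit is a crude
# guard); the sample size n = 50 is an arbitrary choice.
# set.seed(123)   # hopkins() samples rows, so a seed keeps the value stable
# hopkins(na.omit(as.data.frame(df_scaled)), n = 50)$H
# dissplot(distMatrix)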
## to determine goodness of fit for Multidimensional scaling ##
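# cmdscale() reports GOF as the share of eigenvalue mass captured by the first
# k dimensions: GOF[1] = sum(eig[1:k]) / sum(abs(eig)) and
# GOF[2] = sum(eig[1:k]) / sum(pmax(eig, 0)); the curve plotted below shows how
# much of the dissimilarity structure each added dimension recovers.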
gof1=c()
gof2=c()

for(i in seq(1,100,by=2)){
    fit <- cmdscale(distMatrix, eig = T, k = i)
    gof1=c(gof1,fit$GOF[1])
    gof2=c(gof2,fit$GOF[2])
}
qplot(seq(1,100,by=2),gof1) + ylim(0,0.2)

# picking number of dimensions = 2 and plotting the output colored by cluster #

mds_dim=2
data<-cmdscale(distMatrix,k=mds_dim,eig = TRUE)

x <- data$points[,1]
y <- data$points[,2]
# plot(x, y, xlab="Coordinate 1", ylab="Coordinate 2", 
#   main="Metric    MDS",   type="n")
# text(x, y, labels = row.names(df_scaled), cex=.7)

library(magrittr)
library(dplyr)
library(ggpubr)
# 
# ggscatter(data, x = "Dim.1", y = "Dim.2", 
#           label = rownames(df_scaled),
#           size = 1,
#           repel = TRUE)

df=as.data.frame(data$points)
df$id=row.names(df_scaled)
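# kmeans() starts from random centers, so the 5-group split below can change
# between runs; fixing a seed makes it reproducible (the value 42 is an
# arbitrary choice, not from the original analysis).
set.seed(42)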

clust <- kmeans(df[,1:mds_dim], 5)$cluster %>%
  as.factor()

mds <- df[,1:mds_dim] %>%
  mutate(groups = clust)

# Plot and color by groups

ggscatter(mds, x = "V1", y = "V2", 
          label = rownames(df),
          color = "groups",
          palette = "jco",
          size = 1, 
          ellipse = TRUE,
          ellipse.type = "convex",
          repel = TRUE)

# output for number of dimensions = 8

mds_dim=8
data<-cmdscale(distMatrix,k=mds_dim,eig = TRUE)

new_df=as.data.frame(data$points)
new_df$id=row.names(df_scaled)


# finding the optimal k for k-means clustering (elbow method on total within-cluster SS)
kpi=c()
for(i in 1:50){
    cl=kmeans(new_df[,1:mds_dim],centers=i,iter.max = 800, nstart = 4)
    kpi=c(kpi,cl$tot.withinss)
}

qplot(1:50,kpi)
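# an alternative check on k, sketched with the cluster package loaded above
# (illustration only, not part of the original run): average silhouette width
# tends to peak near a good k. The 2:30 range is an arbitrary choice; other
# settings mirror the loop above.
# d8  <- dist(new_df[, 1:mds_dim])
# sil <- sapply(2:30, function(k) {
#   km <- kmeans(new_df[, 1:mds_dim], centers = k, iter.max = 800, nstart = 4)
#   mean(silhouette(km$cluster, d8)[, "sil_width"])
# })
# qplot(2:30, sil)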

# k-means clustering with k = 16, picked after inspecting the elbow plot above
library(clustertend)

cl=kmeans(new_df[,1:mds_dim],centers=16,iter.max = 800, nstart = 4)
new_df$key=cl$cluster
## Assessing goodness of clustering ##

hc_stats <- cluster.stats(new_df[,1:mds_dim],  cl$cluster)
## Warning in as.dist.default(d): non-square matrix
## Warning in df[row(df) > col(df)] <- x: number of items to replace is not a
## multiple of replacement length
hc_stats$within.cluster.ss
## [1] 5.406709
compactness<-sqrt(hc_stats$within.cluster.ss
                  /hc_stats$cluster.number)
compactness
## [1] 0.5813083
separation<-hc_stats$min.separation
separation
## [1] -0.7627614
hc_stats$wb.ratio
## [1] -0.7868989
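# the warnings above come from passing raw coordinates where cluster.stats()
# (fpc) expects a distance object, which is also why min.separation and
# wb.ratio come out negative; re-running on a dist object gives the intended
# statistics (sketch only, output not reproduced here).
# hc_stats_d <- cluster.stats(dist(new_df[, 1:mds_dim]), cl$cluster)
# hc_stats_d$within.cluster.ss
# hc_stats_d$min.separation
# hc_stats_d$wb.ratio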
# final$cluster<-kmeans.hc$cluster
output<-data.table(new_df$id,new_df$key)  
colnames(output)<-c("tuple","cluster")
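# persist the tuple-to-cluster mapping, mirroring the earlier write.csv step
# (file name here is an assumption, not from the original analysis)
# write.csv(output, "cluster_output.csv", row.names = FALSE)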