library(data.table)
library(tm)
## Loading required package: NLP
library(proxy)
##
## Attaching package: 'proxy'
## The following objects are masked from 'package:stats':
##
## as.dist, dist
## The following object is masked from 'package:base':
##
## as.matrix
library(RTextTools)
## Loading required package: SparseM
##
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
##
## backsolve
library(fpc)
library(wordcloud)
## Loading required package: RColorBrewer
library(cluster)
library(stringi)
library(factoextra)
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
## Welcome! Related Books: `Practical Guide To Cluster Analysis in R` at https://goo.gl/13EFCZ
library(seriation)
library(clustertend)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
library(splitstackshape)
library(reshape2)
##
## Attaching package: 'reshape2'
## The following objects are masked from 'package:data.table':
##
## dcast, melt
library(readr)
# dist_matrix <- read_csv("~/dist_matrix_revised.csv", col_names = FALSE)
# colnames(dist_matrix)<-c("tuple_a","tuple_b","overlap","max","value")
# dist_matrix
#
# dist_matrix$v<-1-dist_matrix$value
# table(dist_matrix$v)
# library(data.table)
# master_input_cluster <- dist_matrix[,c("tuple_a","tuple_b","v")]
#
# master_input_cluster<-data.table(master_input_cluster)
#
# final<-dcast(master_input_cluster,tuple_a~tuple_b,fun.aggregate = mean,value.var ="v")
# write.csv(final,"casted_output.csv",na = "",row.names = FALSE)
## Load the pre-computed tuple-by-tuple matrix from casted_output.csv.
## Column `tuple_a` holds the row label; every other column is numeric.
##
## Column types are declared explicitly instead of being guessed. In the
## recorded run readr guessed integer for a number of columns, and later
## fractional values in those columns (e.g. ".947368421052632", ".75") were
## reported as parsing failures (17321 of them) rather than being read.
final <- read_csv(
  "~/casted_output.csv",
  col_types = cols(tuple_a = col_character(), .default = col_double())
)
## Plain data.frame, so downstream code gets base data.frame subsetting.
final <- as.data.frame(final)
## checking if data can be clustered ##
library(factoextra)
library(seriation)
library(clustertend)
# Convert a data frame whose first column holds row labels into a matrix.
#
# x: a data frame (or tibble). Column 1 supplies the row names; all remaining
#    columns supply the matrix entries.
# Returns: as.matrix() of the remaining columns, with row names taken from
#    column 1.
matrix.please <- function(x) {
  # drop = FALSE keeps a single remaining column as a one-column data frame,
  # so its column name survives the conversion to a matrix.
  m <- as.matrix(x[, -1, drop = FALSE])
  # [[1]] extracts the label column as a vector for data.frames and tibbles.
  rownames(m) <- x[[1]]
  m
}
## Labelled numeric matrix built from `final`, and the distances between its
## rows.
df_scaled <- matrix.please(final)
distMatrix <- dist(df_scaled)

## Goodness of fit of classical MDS for k = 1, 3, ..., 99 dimensions.
## cmdscale() reports two GOF values per fit; both are recorded for each k.
gof_dims <- seq(1, 100, by = 2)
gof1 <- numeric(length(gof_dims))
gof2 <- numeric(length(gof_dims))
for (idx in seq_along(gof_dims)) {
  # TRUE rather than T: T is an ordinary variable and can be reassigned.
  fit <- cmdscale(distMatrix, eig = TRUE, k = gof_dims[idx])
  gof1[idx] <- fit$GOF[1]
  gof2[idx] <- fit$GOF[2]
}
## Only the first GOF value is plotted, with the y axis limited to [0, 0.2].
qplot(gof_dims, gof1) + ylim(0, 0.2)

# Classical MDS with two dimensions; x and y are the first and second
# coordinate columns of the fitted configuration.
mds_dim <- 2
data <- cmdscale(d = distMatrix, k = mds_dim, eig = TRUE)
x <- data[["points"]][, 1]
y <- data[["points"]][, 2]
# plot(x, y, xlab="Coordinate 1", ylab="Coordinate 2",
# main="Metric MDS", type="n")
# text(x, y, labels = row.names(df_scaled), cex=.7)
library(magrittr)
library(dplyr)
library(ggpubr)
#
# ggscatter(data, x = "Dim.1", y = "Dim.2",
# label = rownames(df_scaled),
# size = 1,
# repel = TRUE)
# MDS coordinates as a data frame, tagged with the row labels of df_scaled.
df <- as.data.frame(data$points)
df[["id"]] <- row.names(df_scaled)

# k-means with 5 centres on the coordinate columns; the cluster assignment is
# stored as a factor so it can drive the plot colouring.
clust <- as.factor(kmeans(df[, 1:mds_dim], centers = 5)$cluster)
mds <- mutate(df[, 1:mds_dim], groups = clust)

# Scatter plot of the first two coordinates, coloured by cluster, with a
# convex hull drawn around each group.
ggscatter(
  mds,
  x = "V1",
  y = "V2",
  label = rownames(df),
  color = "groups",
  palette = "jco",
  size = 1,
  ellipse = TRUE,
  ellipse.type = "convex",
  repel = TRUE
)

# Repeat the MDS with eight dimensions and keep the coordinates, tagged with
# the row labels of df_scaled, in new_df.
mds_dim <- 8
data <- cmdscale(distMatrix, k = mds_dim, eig = TRUE)
new_df <- as.data.frame(data$points)
new_df$id <- row.names(df_scaled)

# Total within-cluster sum of squares for k = 1..50, used to choose k for
# k-means from the plot below.
kpi <- numeric(50)
for (i in seq_len(50)) {
  cl <- kmeans(new_df[, 1:mds_dim], centers = i, iter.max = 800, nstart = 4)
  kpi[i] <- cl$tot.withinss
}
qplot(1:50, kpi)

# k-means with 16 centres on the first `mds_dim` coordinate columns of new_df;
# the cluster id of each row is stored in new_df$key.
library(clustertend)
cl <- kmeans(new_df[, 1:mds_dim], centers = 16, iter.max = 800, nstart = 4)
new_df$key <- cl$cluster

## Assessing goodness of clustering ##
# cluster.stats() takes a distance object (or distance matrix) as its first
# argument, not the raw coordinates. Passing the coordinate data frame made it
# warn about a "non-square matrix" and return meaningless statistics (for
# example a negative minimum separation), so the values recorded from that run
# are not kept here. stats::dist is named explicitly because `dist` is masked
# by the proxy package in this script.
hc_stats <- cluster.stats(
  stats::dist(as.matrix(new_df[, 1:mds_dim])),
  cl$cluster
)
hc_stats$within.cluster.ss
# Root of the within-cluster sum of squares per cluster.
compactness <- sqrt(hc_stats$within.cluster.ss / hc_stats$cluster.number)
compactness
separation <- hc_stats$min.separation
separation
hc_stats$wb.ratio
# final$cluster<-kmeans.hc$cluster

# One row per tuple label with its assigned cluster id.
output <- data.table(tuple = new_df$id, cluster = new_df$key)