ConstrucciĂ³n de la matriz de distancia
sentences <-BD_unq$Abstract
corpus = tm::Corpus(tm::VectorSource(sentences))
# Cleaning up
# Handling UTF-8 encoding problem from the dataset
corpus.cleaned <- tm::tm_map(corpus, function(x) iconv(x, to='UTF-8', sub='byte'))
## Warning in tm_map.SimpleCorpus(corpus, function(x) iconv(x, to = "UTF-8", :
## transformation drops documents
corpus.cleaned <- tm::tm_map(corpus.cleaned, tm::removeWords, tm::stopwords('english')) # Removing stop-words
## Warning in tm_map.SimpleCorpus(corpus.cleaned, tm::removeWords,
## tm::stopwords("english")): transformation drops documents
corpus.cleaned <- tm::tm_map(corpus.cleaned, tm::stemDocument, language = "english") # Stemming the words
## Warning in tm_map.SimpleCorpus(corpus.cleaned, tm::stemDocument, language =
## "english"): transformation drops documents
corpus.cleaned <- tm::tm_map(corpus.cleaned, tm::stripWhitespace) # Trimming excessive whitespaces
## Warning in tm_map.SimpleCorpus(corpus.cleaned, tm::stripWhitespace):
## transformation drops documents
# Building the feature matrices
tdm <- tm::DocumentTermMatrix(corpus.cleaned)
tdm.tfidf <- tm::weightTfIdf(tdm)
# We remove A LOT of features. R is natively very weak with high dimensional matrix
tdm.tfidf <- tm::removeSparseTerms(tdm.tfidf, 0.999)
# There is the memory-problem part
# - Native matrix isn't "sparse-compliant" in the memory
# - Sparse implementations aren't necessary compatible with clustering algorithms
tfidf.matrix <- as.matrix(tdm.tfidf)
# Cosine distance matrix (useful for specific clustering algorithms)
dist.matrix = proxy::dist(tfidf.matrix, method = "cosine")
Cluster con k=10 arbitrario
library(tidyverse) # data manipulation
## Warning: package 'tidyverse' was built under R version 3.6.3
## -- Attaching packages ----------------------------------------------------------------------------------------- tidyverse 1.3.0 --
## <U+2713> ggplot2 3.3.2 <U+2713> purrr 0.3.3
## <U+2713> tibble 3.0.4 <U+2713> dplyr 1.0.6
## <U+2713> tidyr 1.1.3 <U+2713> stringr 1.4.0
## <U+2713> readr 1.4.0 <U+2713> forcats 0.4.0
## Warning: package 'ggplot2' was built under R version 3.6.3
## Warning: package 'tibble' was built under R version 3.6.3
## Warning: package 'tidyr' was built under R version 3.6.3
## Warning: package 'readr' was built under R version 3.6.3
## Warning: package 'dplyr' was built under R version 3.6.3
## Warning: package 'stringr' was built under R version 3.6.3
## -- Conflicts -------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x ggplot2::annotate() masks NLP::annotate()
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(cluster) # clustering algorithms
library(factoextra)
## Warning: package 'factoextra' was built under R version 3.6.3
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
df=cmdscale(dist.matrix, k = 2)
clustering.kmeans <- kmeans(df, 5)
BD_unq = BD_unq %>%
mutate(cluster = as.factor(clustering.kmeans$cluster ),
point_x = cmdscale(dist.matrix, k = 2)[,1],
point_y = cmdscale(dist.matrix, k = 2)[,2])
BD_unq %>%
ggplot(aes(x= point_x, y=point_y, colour =cluster)) + geom_point()+
labs(title="K-Means clustering")

#elbow method
set.seed(123)
fviz_nbclust(df, kmeans, method = "wss",k.max = 20)

fviz_nbclust(df, kmeans, method = "silhouette",k.max = 20)

Obtener cluster
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
optik=5
clustering.kmeans <- kmeans(df, optik)
BD_unq = BD_unq %>%
mutate(cluster = as.factor(clustering.kmeans$cluster ))
plot1= BD_unq %>%
ggplot(aes(x= point_x, y=point_y, colour =cluster)) + geom_point()+
labs(title="K-Means clustering")
plot2= BD_unq %>%
count(cluster, sort = TRUE) %>%
mutate(nsum = cumsum(n)- 0*n) %>%
ggplot(aes(x="", y=n, fill=cluster))+
geom_bar(width = 1, stat = "identity", color = "white") +
#coord_polar("y", start=0)+
#geom_text(aes(y = nsum, label = n), color = "white")+
labs(x="", y="", title= "Tamaño de los clusters")
do.call("grid.arrange", c(list(plot1, plot2), ncol = 2))

## Warning: package 'tidytext' was built under R version 3.6.3
data(stop_words)
for (i in 1:max(clustering.kmeans$cluster)) {
plot1 = BD_unq %>%
filter(cluster == i) %>%
select(Abstract) %>% unnest_tokens(word, Abstract) %>%
anti_join(stop_words) %>%
count(word, sort = TRUE) %>%
#filter(n >= quantile(n,0.9) & word > "a") %>%
mutate(word = reorder(word, n)) %>%
slice_head(n = 25) %>%
ggplot(aes(n, word)) +
geom_col() +
labs(title= paste("Top abstract's words cluster",as.character(i)), y=NULL, x= "Frecuencia")
plot2 = BD_unq %>%
filter(cluster == i) %>%
select(Title) %>%
unnest_tokens(word, Title) %>%
anti_join(stop_words) %>%
count(word, sort = TRUE) %>%
#filter(n >= quantile(n,0.9) & word > "a") %>%
mutate(word = reorder(word, n)) %>%
slice_head(n = 25) %>%
ggplot(aes(n, word)) +
geom_col() +
labs(title= paste("Top title's words cluster",as.character(i)), y=NULL, x= "Frecuencia")
do.call("grid.arrange", c(list(plot1, plot2), ncol = 2))
}
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"

## Joining, by = "word"
## Joining, by = "word"

## Joining, by = "word"
## Joining, by = "word"

## Joining, by = "word"
## Joining, by = "word"

