rm(list=ls())               # Limpiamos todos los objetos creados en R
graphics.off()              # Limpiamos los grĂ¡ficos
options(digits = 3)         # Declaramos dĂ­gitos despues del punto para observar (decimas, centesimas,...)
set.seed(12345) #fijar semilla de aleatoriedad
#setwd("/cloud/project/Skycrops") #fijar directorio de preferencia

Clustering

library(tm)
library(dbscan)
library(proxy)
library(colorspace)

load("BD.RData")
load("BD_unq.RData")

ConstrucciĂ³n de la matriz de distancia

sentences <-BD_unq$Abstract

corpus = tm::Corpus(tm::VectorSource(sentences))

# Cleaning up
# Handling UTF-8 encoding problem from the dataset
corpus.cleaned <- tm::tm_map(corpus, function(x) iconv(x, to='UTF-8', sub='byte')) 
## Warning in tm_map.SimpleCorpus(corpus, function(x) iconv(x, to = "UTF-8", :
## transformation drops documents
corpus.cleaned <- tm::tm_map(corpus.cleaned, tm::removeWords, tm::stopwords('english')) # Removing stop-words
## Warning in tm_map.SimpleCorpus(corpus.cleaned, tm::removeWords,
## tm::stopwords("english")): transformation drops documents
corpus.cleaned <- tm::tm_map(corpus.cleaned, tm::stemDocument, language = "english") # Stemming the words 
## Warning in tm_map.SimpleCorpus(corpus.cleaned, tm::stemDocument, language =
## "english"): transformation drops documents
corpus.cleaned <- tm::tm_map(corpus.cleaned, tm::stripWhitespace) # Trimming excessive whitespaces
## Warning in tm_map.SimpleCorpus(corpus.cleaned, tm::stripWhitespace):
## transformation drops documents
# Building the feature matrices
tdm <- tm::DocumentTermMatrix(corpus.cleaned)
tdm.tfidf <- tm::weightTfIdf(tdm)

# We remove A LOT of features. R is natively very weak with high dimensional matrix
tdm.tfidf <- tm::removeSparseTerms(tdm.tfidf, 0.999)

# There is the memory-problem part
# - Native matrix isn't "sparse-compliant" in the memory
# - Sparse implementations aren't necessary compatible with clustering algorithms
tfidf.matrix <- as.matrix(tdm.tfidf)
# Cosine distance matrix (useful for specific clustering algorithms)
dist.matrix = proxy::dist(tfidf.matrix, method = "cosine")

Cluster con k=10 arbitrario

library(tidyverse)  # data manipulation
## Warning: package 'tidyverse' was built under R version 3.6.3
## -- Attaching packages ----------------------------------------------------------------------------------------- tidyverse 1.3.0 --
## <U+2713> ggplot2 3.3.2     <U+2713> purrr   0.3.3
## <U+2713> tibble  3.0.4     <U+2713> dplyr   1.0.6
## <U+2713> tidyr   1.1.3     <U+2713> stringr 1.4.0
## <U+2713> readr   1.4.0     <U+2713> forcats 0.4.0
## Warning: package 'ggplot2' was built under R version 3.6.3
## Warning: package 'tibble' was built under R version 3.6.3
## Warning: package 'tidyr' was built under R version 3.6.3
## Warning: package 'readr' was built under R version 3.6.3
## Warning: package 'dplyr' was built under R version 3.6.3
## Warning: package 'stringr' was built under R version 3.6.3
## -- Conflicts -------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x ggplot2::annotate() masks NLP::annotate()
## x dplyr::filter()     masks stats::filter()
## x dplyr::lag()        masks stats::lag()
library(cluster)    # clustering algorithms
library(factoextra)
## Warning: package 'factoextra' was built under R version 3.6.3
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
df=cmdscale(dist.matrix, k = 2)


clustering.kmeans <- kmeans(df, 5)

BD_unq = BD_unq %>%
  mutate(cluster = as.factor(clustering.kmeans$cluster ), 
         point_x =  cmdscale(dist.matrix, k = 2)[,1],
         point_y = cmdscale(dist.matrix, k = 2)[,2])

BD_unq %>%
  ggplot(aes(x= point_x, y=point_y, colour =cluster)) + geom_point()+
  labs(title="K-Means clustering")

#elbow method
set.seed(123)

fviz_nbclust(df, kmeans, method = "wss",k.max = 20)

fviz_nbclust(df, kmeans, method = "silhouette",k.max = 20)

Obtener cluster

library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
optik=5
clustering.kmeans <- kmeans(df, optik)

BD_unq = BD_unq %>%
  mutate(cluster = as.factor(clustering.kmeans$cluster ))


plot1= BD_unq %>%
  ggplot(aes(x= point_x, y=point_y, colour =cluster)) + geom_point()+
  labs(title="K-Means clustering")

plot2= BD_unq %>%
  count(cluster, sort = TRUE) %>%
  mutate(nsum = cumsum(n)- 0*n)  %>%
  ggplot(aes(x="", y=n, fill=cluster))+
  geom_bar(width = 1, stat = "identity", color = "white") +
  #coord_polar("y", start=0)+
  #geom_text(aes(y = nsum, label = n), color = "white")+
  labs(x="", y="", title= "Tamaño de los clusters")

do.call("grid.arrange", c(list(plot1, plot2), ncol = 2)) 

library(tidytext)
## Warning: package 'tidytext' was built under R version 3.6.3
data(stop_words)


for (i in 1:max(clustering.kmeans$cluster)) {
  plot1 = BD_unq %>%
    filter(cluster == i) %>%
    select(Abstract) %>%    unnest_tokens(word, Abstract) %>%
    anti_join(stop_words) %>%
    count(word, sort = TRUE) %>%
    #filter(n >= quantile(n,0.9)  & word > "a") %>%
    mutate(word = reorder(word, n)) %>%
    slice_head(n = 25) %>%
    ggplot(aes(n, word)) +
    geom_col() +
    labs(title= paste("Top abstract's words cluster",as.character(i)), y=NULL, x= "Frecuencia")
  
  plot2 = BD_unq %>%
    filter(cluster == i) %>%
    select(Title) %>%
    unnest_tokens(word, Title) %>%
    anti_join(stop_words) %>%
    count(word, sort = TRUE) %>%
    #filter(n >= quantile(n,0.9)  & word > "a") %>%
    mutate(word = reorder(word, n)) %>%
    slice_head(n = 25) %>%
    ggplot(aes(n, word)) +
    geom_col() +
    labs(title= paste("Top title's words cluster",as.character(i)), y=NULL, x= "Frecuencia")
  
  do.call("grid.arrange", c(list(plot1, plot2), ncol = 2)) 
  
}
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"

## Joining, by = "word"
## Joining, by = "word"

## Joining, by = "word"
## Joining, by = "word"

## Joining, by = "word"
## Joining, by = "word"

#

RelaciĂ³n de las palabras en los abstract de los clusters

count_bigrams <- function(dataset) {
  dataset %>%
    unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
    separate(bigram, c("word1", "word2"), sep = " ") %>%
    filter(!word1 %in% stop_words$word,
           !word2 %in% stop_words$word) %>%
    count(word1, word2, sort = TRUE)
}

visualize_bigrams <- function(bigrams) {
  set.seed(2016)
  a <- grid::arrow(type = "closed", length = unit(.15, "inches"))
  
  bigrams %>%
    graph_from_data_frame() %>%
    ggraph(layout = "fr") +
    geom_edge_link(aes(edge_alpha = n), show.legend = FALSE, arrow = a) +
    geom_node_point(color = "lightblue", size = 5) +
    geom_node_text(aes(label = name), vjust = 1, hjust = 1) +
    theme_void()
}
library(stringr)
for (i in 1:max(as.numeric(BD_unq$cluster))) {
  plot= BD_unq %>%
    filter(as.numeric(cluster)== i) %>%
    mutate(pp=10)%>%
    select(pp, text=Abstract) %>%
    count_bigrams()%>%
    filter(!str_detect(word1, "\\d"),
           !str_detect(word2, "\\d")) %>%
    mutate(word1 = reorder(word1, n)) %>%
    slice_head(n = 30)%>%
    visualize_bigrams() +labs(title = paste("RelaciĂ³n entre palabras en Abstract en cluster",as.character(i)))
  
  do.call("grid.arrange", c(list(plot), ncol = 1)) 
  
  
}

Analisis de los IPC de las patentes

library(stringr)
for (i in 1:max(as.numeric(BD_unq$cluster))) {
  plot= BD_unq %>%
    filter(as.numeric(cluster)== i) %>%
    mutate(pp=10)%>%
    select(pp, text=Title) %>%
    count_bigrams()%>%
    filter(!str_detect(word1, "\\d"),
           !str_detect(word2, "\\d")) %>%
    mutate(word1 = reorder(word1, n)) %>%
    slice_head(n = 30)%>%
    visualize_bigrams() +labs(title = paste("RelaciĂ³n entre palabras en Abstract en cluster",as.character(i)))
  
  do.call("grid.arrange", c(list(plot), ncol = 1)) 
  
  
}