Conexiones entre palabras

## Loading required package: NLP

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

## 
## Attaching package: 'igraph'

## The following objects are masked from 'package:purrr':
## 
##     compose, simplify

## The following objects are masked from 'package:dplyr':
## 
##     as_data_frame, groups, union

## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum

## The following object is masked from 'package:base':
## 
##     union

Obtenemos tweets

# Open a handle to the "tweets_mongo_covid19" collection; the connection URL
# is taken from the document parameter `params$con`.
# NOTE(review): presumably mongolite::mongo() — confirm against the setup chunk.
tweets_collection <- mongo("tweets_mongo_covid19", url = params$con)
# Run find() with the empty JSON query '{}' (no filter) and keep the result.
# NOTE(review): `all` shares its name with base::all().
all <- tweets_collection$find(query = '{}')

Limpiamos tweets

# Spanish stop words, used as the default filter when tokenising tweets.
stops <- tm::stopwords("spanish")

# Clean one tweet and split it into word tokens.
#
# The text is lower-cased, URLs (anything starting with "http") are removed,
# punctuation and digits are replaced by spaces, runs of whitespace are
# collapsed, and the result is split on single spaces. Tokens of one
# character or less and tokens found in the stop-word list are discarded.
#
# Args:
#   texto: the tweet text. Only the first element is tokenised, so pass one
#     tweet at a time (e.g. through map()).
#   palabras_vacias: character vector of tokens to discard. Defaults to the
#     global `stops`, so existing single-argument calls behave as before.
#
# Returns:
#   A character vector of tokens in their original order; character(0) when
#   `texto` is empty, missing, or contains no usable token.
limpiar_tokenizar <- function(texto, palabras_vacias = stops) {
  # Zero-length input used to fail at `[[1]]` below, and missing text has
  # nothing to tokenise, so both cases yield an empty token vector.
  if (length(texto) == 0L || is.na(texto[[1]])) {
    return(character(0))
  }

  nuevo_texto <- tolower(texto[[1]])
  nuevo_texto <- str_replace_all(nuevo_texto, "http\\S*", "")
  nuevo_texto <- str_replace_all(nuevo_texto, "[[:punct:]]", " ")
  nuevo_texto <- str_replace_all(nuevo_texto, "[[:digit:]]", " ")
  nuevo_texto <- str_replace_all(nuevo_texto, "[\\s]+", " ")
  tokens <- str_split(nuevo_texto, " ")[[1]]

  # One vectorised mask instead of calling a predicate once per token.
  tokens[str_length(tokens) > 1 & !(tokens %in% palabras_vacias)]
}

# Work on at most `params$limit` rows of the fetched tweets.
data <- head(all, n = params$limit)

# Add `text_token`, a list-column holding the token vector that
# limpiar_tokenizar() returns for each value of `text`.
tweets <- mutate(data, text_token = map(text, limpiar_tokenizar))

# Preview the tokens of the first few tweets.
head(tweets[["text_token"]])

## [[1]]
## [1] "bueno"       "capitulo"    "rick"        "morty"       "coronavirus"
## 
## [[2]]
##  [1] "aquí"         "bonito"       "mapa"         "estadísticas" "avance"      
##  [6] "covid"        "méxico"       "covid"        "corona"       "virus"       
## [11] "mexico"       "covid"        "cuarentena"   "coronavirus"  "covidmx"     
## 
## [[3]]
##  [1] "willaxtv"       "milagrosleivag" "huacchillo"     "preguntan"     
##  [5] "cuanto"         "ejecutado"      "presupuesto"    "regional"      
##  [9] "cuanto"         "invierte"       "salud"          "lado"          
## [13] "gobierno"       "regional"       "dijeron"        "población"     
## [17] "seguía"         "haciendo"       "daba"           "gana"          
## [21] "día"            "cuarentena"     "medidas"        "tomaron"       
## 
## [[4]]
## [1] "salir"      "fiesta"     "cuarentena" "nacos"     
## 
## [[5]]
## [1] "dios"       "odio"       "mundo"      "quiero"     "pasar"     
## [6] "cuarentena" "solo"      
## 
## [[6]]
## [1] "unidos"       "dio"          "autorización" "usar"         "remdesivir"  
## [6] "tratar"       "pacientes"    "coronavirus"

Construimos vértices (nodos) y aristas (relaciones)

# Build the edge list of the word co-occurrence graph: every pair of distinct
# words appearing in the same tweet yields one edge, repeated once per
# co-occurrence.
#
# For each tweet, combn() enumerates the pairs (words[i], words[j]) with
# i < j, in the same order a nested i/j loop would visit them. Pairs whose two
# words are equal are dropped, so a word repeated inside a tweet does not link
# to itself. Per-tweet results are bound together once at the end instead of
# growing `from`/`to` with c() for every pair, which copied the vectors on
# each append.
pair_matrices <- lapply(tweets$text_token, function(words) {
  if (length(words) < 2L) {
    return(NULL) # fewer than two words cannot form a pair
  }
  pairs <- utils::combn(words, 2L)
  pairs[, pairs[1L, ] != pairs[2L, ], drop = FALSE]
})
all_pairs <- do.call(cbind, pair_matrices)

if (is.null(all_pairs)) {
  # No tweet produced a pair (this also covers a `tweets` with zero rows,
  # which the former `1:nrow(tweets)` loop could not handle).
  from <- character(0)
  to <- character(0)
} else {
  from <- all_pairs[1L, ]
  to <- all_pairs[2L, ]
}

# One row per edge; the two columns are the endpoints.
rels <- data.frame(from = from, to = to)

head(rels)

##       from          to
## 1    bueno    capitulo
## 2    bueno        rick
## 3    bueno       morty
## 4    bueno coronavirus
## 5 capitulo        rick
## 6 capitulo       morty

# Build an undirected graph whose edges are the word pairs in `rels`.
g <- graph_from_data_frame(rels, directed = FALSE)
# Per-vertex strength, computed while parallel edges are still present.
# Presumably this makes repeated co-occurrences count more than once — confirm
# against igraph's strength() behaviour for graphs without edge weights.
# NOTE(review): `str` shares its name with utils::str().
str <- strength(g)
# Largest strength; used below to scale vertex sizes.
m <- max(str)

# Collapse parallel edges and drop self-loops before plotting.
g <- igraph::simplify(g, remove.multiple = TRUE, remove.loops = TRUE)

# Render the graph to a 5000 x 5000 PNG file.
# `filename` is spelled out in full: png() has no `file` argument, so the
# shorter name only worked through partial argument matching.
png(filename = "/tmp/output.png", height = 5000, width = 5000)
# Vertex size is proportional to strength; the strongest vertex gets size 15.
plot(g, vertex.size = str * 15 / m, axes = TRUE, asp = 0)
# Close the device so the image is flushed to disk.
dev.off()

## png 
##   2

# plot(g)
# tkplot(g, vertex.size = str * 15 / m, vertex.color = '#EFFD0E')