## Loading required package: NLP
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
##
## Attaching package: 'igraph'
## The following objects are masked from 'package:purrr':
##
## compose, simplify
## The following objects are masked from 'package:dplyr':
##
## as_data_frame, groups, union
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
- Obtenemos tweets
# Open a handle to the "tweets_mongo_covid19" collection, using the connection
# URL supplied through the document parameter `params$con`.
tweets_collection <- mongo("tweets_mongo_covid19", url = params$con)
# Run find() with an empty query document. Presumably this returns every stored
# tweet as a data frame -- TODO confirm against the Mongo client in use.
# NOTE(review): the result is stored under the name `all`, which is also the
# name of base::all().
all <- tweets_collection$find(query = '{}')
- Limpiamos tweets
# Spanish stop-word list provided by the tm package; consulted by
# limpiar_tokenizar() when filtering tokens.
stops <- tm::stopwords("spanish")

# Clean a tweet text and split it into tokens.
#
# Steps, in order: lower-case the text, delete anything that starts with
# "http" up to the next whitespace, replace punctuation and digits with a
# space, collapse whitespace runs into a single space, and split on " ".
# Only the first element of the split result is used, so `texto` is expected
# to be a single string.
#
# Returns a character vector holding the tokens that are longer than one
# character and are not present in the global `stops` vector.
limpiar_tokenizar <- function(texto) {
  limpio <- str_replace_all(tolower(texto), "http\\S*", "")
  # These three patterns are all replaced by a single space, in this order.
  for (patron in c("[[:punct:]]", "[[:digit:]]", "[\\s]+")) {
    limpio <- str_replace_all(limpio, patron, " ")
  }
  tokens <- str_split(limpio, " ")[[1]]

  # Scalar predicate applied to each token by keep().
  es_valido <- function(token) {
    str_length(token) > 1 && !(token %in% stops)
  }
  keep(tokens, es_valido)
}
# Keep only the first `params$limit` rows of the fetched tweets.
data <- head(all, params$limit)
# Add a list-column `text_token` holding the cleaned tokens of each tweet's
# `text` field, one character vector per row.
tweets <- mutate(data, text_token = map(text, limpiar_tokenizar))
# Preview the token vectors of the first rows.
head(tweets$text_token)
## [[1]]
## [1] "bueno" "capitulo" "rick" "morty" "coronavirus"
##
## [[2]]
## [1] "aquÃ" "bonito" "mapa" "estadÃsticas" "avance"
## [6] "covid" "méxico" "covid" "corona" "virus"
## [11] "mexico" "covid" "cuarentena" "coronavirus" "covidmx"
##
## [[3]]
## [1] "willaxtv" "milagrosleivag" "huacchillo" "preguntan"
## [5] "cuanto" "ejecutado" "presupuesto" "regional"
## [9] "cuanto" "invierte" "salud" "lado"
## [13] "gobierno" "regional" "dijeron" "población"
## [17] "seguÃa" "haciendo" "daba" "gana"
## [21] "dÃa" "cuarentena" "medidas" "tomaron"
##
## [[4]]
## [1] "salir" "fiesta" "cuarentena" "nacos"
##
## [[5]]
## [1] "dios" "odio" "mundo" "quiero" "pasar"
## [6] "cuarentena" "solo"
##
## [[6]]
## [1] "unidos" "dio" "autorización" "usar" "remdesivir"
## [6] "tratar" "pacientes" "coronavirus"
- Construimos vértices (nodos) y aristas (relaciones)
# Build the co-occurrence edge list: for every tweet, emit one edge for each
# pair of token positions (i, j) with i < j whose tokens differ. Pairs are
# produced in the order (1,2), (1,3), ..., (2,3), ... within each tweet, and
# tweets are processed in row order.
#
# Pairs are generated per tweet with combn() and concatenated once at the end,
# instead of growing `from`/`to` with c() inside nested loops. Iterating over
# the list column directly also avoids `1:nrow(tweets)`, which would loop over
# c(1, 0) when there are no rows.
edges_per_tweet <- lapply(tweets$text_token, function(words) {
  n_words <- length(words)
  if (n_words < 2) {
    # A tweet with fewer than two tokens contributes no edges.
    return(list(from = character(0), to = character(0)))
  }
  # 2 x choose(n_words, 2) matrix of index pairs; row 1 is i, row 2 is j.
  pairs <- utils::combn(n_words, 2)
  first <- words[pairs[1, ]]
  second <- words[pairs[2, ]]
  # Drop pairs made of the same word so no self-loops are created.
  distinct <- first != second
  list(from = first[distinct], to = second[distinct])
})

# as.character() turns the NULL that unlist() yields for "no edges at all"
# into character(0), so `rels` always has both columns.
from <- as.character(
  unlist(lapply(edges_per_tweet, `[[`, "from"), use.names = FALSE)
)
to <- as.character(
  unlist(lapply(edges_per_tweet, `[[`, "to"), use.names = FALSE)
)
rels <- data.frame(from = from, to = to)
head(rels)
## from to
## 1 bueno capitulo
## 2 bueno rick
## 3 bueno morty
## 4 bueno coronavirus
## 5 capitulo rick
## 6 capitulo morty
# Build an undirected graph from the edge list in `rels`.
g <- graph_from_data_frame(rels, directed = FALSE)
# Per-vertex strength and its maximum are taken here, on the graph as built
# from `rels`, i.e. before simplify() below removes duplicate edges and loops.
str <- strength(g)
m <- max(str)
g <- igraph::simplify(g, remove.multiple = TRUE, remove.loops = TRUE)
# Render to a 5000 x 5000 PNG. The argument is spelled `filename` in full
# rather than relying on partial matching of `file`.
png(filename = "/tmp/output.png", height = 5000, width = 5000)
# Vertex sizes are scaled so that the vertex with the largest strength gets
# size 15.
plot(g, vertex.size = str * 15 / m, axes = TRUE, asp = 0)
# Close the PNG device so the file is written out.
dev.off()
## png
## 2
# plot(g)
# tkplot(g, vertex.size = str * 15 / m, vertex.color = '#EFFD0E')
