Nube de Palabras

Gráfica “word cloud” a partir de texto

Tomado de:

Word cloud generator in R : One killer function to do everything you need - Easy Guides - Wiki - STHDA [WWW Document], n.d. URL http://www.sthda.com/english/wiki/word-cloud-generator-in-r-one-killer-function-to-do-everything-you-need (accessed 7.8.20).

Paquetes necesarios:
* tm
* SnowballC
* wordcloud
* RColorBrewer
* RCurl
* XML

## para instalar paquetes:
# install.packages(c("tm", "SnowballC", "wordcloud", "RColorBrewer", "RCurl", "XML"))
library(tm)
library(SnowballC)
library(wordcloud)
library(RColorBrewer)
library(RCurl)
library(XML)
library(rsconnect)

## Warning: package 'rsconnect' was built under R version 4.0.2

Funciones necesarias para crear el “word cloud”

# wordcloud function
#' Build and plot a word cloud from plain text, a web page, or a text file
#'
#' @param x Character input whose meaning depends on `type`: the text itself,
#'   a web page URL, or the path of a text file.
#' @param type One of "text", "url" or "file". Only the first element is
#'   used, so the default vector resolves to "text". Unambiguous prefixes
#'   are accepted; any other value raises an error.
#' @param lang Language name passed to `stopwords()`.
#' @param excludeWords Optional character vector of additional words to
#'   remove; `NULL` skips this step.
#' @param textStemming If `TRUE`, words are stemmed with `stemDocument`.
#' @param colorPalette Either the name of an RColorBrewer palette (8 colors
#'   are taken from it) or any other value, which is passed unchanged to
#'   `wordcloud()` as its `colors` argument.
#' @param min.freq,max.words Passed straight to `wordcloud()`.
#' @return Invisibly, a list with `tdm` (the term-document matrix) and
#'   `freqTable` (data frame with columns `word` and `freq`, sorted by
#'   decreasing frequency).
rquery.wordcloud <- function(x, type = c("text", "url", "file"),
                             lang = "english", excludeWords = NULL,
                             textStemming = FALSE, colorPalette = "Dark2",
                             min.freq = 3, max.words = 200) {
  # Validate the source type first. Previously an unknown type matched none
  # of the branches, `text` was never assigned locally, and later steps used
  # whatever `text` resolved to outside the function instead of reporting
  # the bad argument.
  type <- match.arg(type[1], c("text", "url", "file"))

  # Kept from the original so the function still works when the caller has
  # not attached these packages.
  library("tm")
  library("SnowballC")
  library("wordcloud")
  library("RColorBrewer")

  # Obtain the raw text from the requested source
  text <- switch(type,
    file = readLines(x),
    url = html_to_text(x),
    text = x
  )

  # Load the text as a corpus
  docs <- Corpus(VectorSource(text))
  # Convert the text to lower case
  docs <- tm_map(docs, content_transformer(tolower))
  # Remove numbers
  docs <- tm_map(docs, removeNumbers)
  # Remove stopwords for the language (done before punctuation is stripped)
  docs <- tm_map(docs, removeWords, stopwords(lang))
  # Remove punctuation
  docs <- tm_map(docs, removePunctuation)
  # Eliminate extra white spaces
  docs <- tm_map(docs, stripWhitespace)
  # Remove the caller's own stopwords
  if (!is.null(excludeWords)) {
    docs <- tm_map(docs, removeWords, excludeWords)
  }
  # Text stemming
  if (textStemming) {
    docs <- tm_map(docs, stemDocument)
  }

  # Create the term-document matrix and the sorted frequency table
  tdm <- TermDocumentMatrix(docs)
  m <- as.matrix(tdm)
  v <- sort(rowSums(m), decreasing = TRUE)
  d <- data.frame(word = names(v), freq = v)

  # A single string naming an RColorBrewer palette is expanded to 8 colors;
  # anything else (a color name, a hex code, or a vector of colors) is used
  # as given. The length check keeps the condition scalar.
  is_brewer_name <- length(colorPalette) == 1L &&
    colorPalette %in% rownames(brewer.pal.info)
  colors <- if (is_brewer_name) brewer.pal(8, colorPalette) else colorPalette

  # Plot the word cloud; the fixed seed makes the layout reproducible
  set.seed(1234)
  wordcloud(d$word, d$freq, min.freq = min.freq, max.words = max.words,
            random.order = FALSE, rot.per = 0.35,
            use.r.layout = FALSE, colors = colors)

  invisible(list(tdm = tdm, freqTable = d))
}
# html_to_text function
#' Download a web page and return its text content as one string
#'
#' @param url Address of the page to fetch.
#' @return A single character string: the page's text nodes joined by spaces.
html_to_text <- function(url) {
  library(RCurl)
  library(XML)
  # Fetch the raw HTML and parse it from the in-memory string
  page_source <- getURL(url)
  parsed_page <- htmlParse(page_source, asText = TRUE)
  # Select every text node except those nested inside script, style,
  # noscript or form elements
  text_nodes_xpath <- "//text()[not(ancestor::script)][not(ancestor::style)][not(ancestor::noscript)][not(ancestor::form)]"
  fragments <- xpathSApply(parsed_page, text_nodes_xpath, xmlValue)
  # Collapse the fragments into one character string
  paste(fragments, collapse = " ")
}

Nube de palabras a partir de tres fuentes

# source 1: a web page URL
deurl <- "https://dsfernandez.github.io/bioestadisticas/modulos/modulod1.html"
# source 2: placeholder for a file location (the string is Spanish for
# "URL or file path"); replace it with a real path before use
depath <- "url o dirección del archivo"
# source 3: despite the name ("from text"), this holds the path of a .txt
# file, not the text itself
detexto <- c("./Biometria_encuesta_estadisticas-tu_20200815.txt")
## example: word cloud built from the text file above (type = "file"),
## removing Spanish stopwords
res <- rquery.wordcloud(detexto, type ="file",
                        lang = "spanish",
                        excludeWords = NULL,
                        textStemming = FALSE,
                        colorPalette = "Dark2",
                        min.freq = 4,
                        max.words = 100)

Parámetros de la función

x : character string (plain text, web URL, txt file path)
type : specify whether x is a plain text, a web page URL or a .txt file path
lang : the language of the text. This is important to be specified in order to remove the common stopwords (like ‘the’, ‘we’, ‘is’, ‘are’) from the text before further analysis. Supported languages are danish, dutch, english, finnish, french, german, hungarian, italian, norwegian, portuguese, russian, spanish and swedish.
excludeWords : a vector containing your own stopwords to be eliminated from the text. e.g : c(“word1”, “word2”)
textStemming : reduces words to their root form. Default value is FALSE. A stemming process reduces the words “moving” and “movement” to the root word, “move”.
colorPalette : Possible values are: the name of a color palette from the RColorBrewer package (e.g. colorPalette = “Dark2”), a color name (e.g. colorPalette = “red”), or a color code (e.g. colorPalette = “#FF1245”).
min.freq : words with frequency below min.freq will not be plotted
max.words : maximum number of words to be plotted; the least frequent terms are dropped.

Análisis del texto

A partir de la función rquery.wordcloud() se pueden obtener dos objetos:
* tdm : matriz de términos por documento (term-document matrix)
* freqTable : tabla de frecuencias

Tabla y gráfica de frecuencias

# frequency table (first 20 rows, i.e. the 20 most frequent words)
TablaFrecuencia <- res[["freqTable"]]
head(TablaFrecuencia, n = 20)

##                      word freq
## estadísticas estadísticas   20
## datos               datos   16
## vida                 vida   14
## estadisticas estadisticas   10
## estudios         estudios   10
## manera             manera   10
## pueden             pueden   10
## ayuda               ayuda    9
## ayudar             ayudar    9
## poder               poder    9
## tomar               tomar    9
## cotidiana       cotidiana    8
## decisiones     decisiones    7
## ejemplo           ejemplo    7
## mas                   mas    7
## entender         entender    6
## forma               forma    6
## futuro             futuro    6
## importante     importante    6
## ser                   ser    6

# bar chart of the most frequent words (up to 10)
# head() keeps only rows that exist, so a table with fewer than 10 words no
# longer yields the NA-filled rows that indexing with 1:10 produced
with(
  head(TablaFrecuencia, 10),
  barplot(freq,
          las = 2,
          names.arg = word,
          col = "lightblue",
          main = "Palabras más frecuentes",
          ylab = "Frecuencia")
)

Selección de palabras con cierta frecuencia

# terms whose frequency is at least 4
tdm <- res[["tdm"]]
findFreqTerms(tdm, lowfreq = 4)

##  [1] "ayuda"        "ayudan"       "ayudar"       "cotidiana"    "covid"       
##  [6] "datos"        "debido"       "decisiones"   "ejemplo"      "entender"    
## [11] "estadisticas" "estadísticas" "estudios"     "forma"        "futuro"      
## [16] "hora"         "importante"   "manera"       "mas"          "mejor"       
## [21] "poder"        "podría"       "pueden"       "resultados"   "ser"         
## [26] "tener"        "tomar"        "ver"          "vida"

# association between one term ("upr") and the other terms in the matrix;
# corlimit = 0.1 is the lower correlation limit given to findAssocs
findAssocs(tdm, terms = "upr", corlimit = 0.1)

## $upr
## numeric(0)