Tomado de:
Word cloud generator in R : One killer function to do everything you need - Easy Guides - Wiki - STHDA [WWW Document], n.d. URL http://www.sthda.com/english/wiki/word-cloud-generator-in-r-one-killer-function-to-do-everything-you-need (accessed 7.8.20).
Paquetes necesarios:
* tm
* SnowballC
* wordcloud
* RColorBrewer
* RCurl
* XML
## para instalar paquetes:
# install.packages(c("tm", "SnowballC", "wordcloud", "RColorBrewer", "RCurl", "XML"))
library(tm)
library(SnowballC)
library(wordcloud)
library(RColorBrewer)
library(RCurl)
library(XML)
library(rsconnect)
## Warning: package 'rsconnect' was built under R version 4.0.2
# Word cloud generator.
#
# Builds a tm corpus from the input, cleans it (lower case, numbers,
# stopwords, punctuation, extra whitespace, optional stemming), computes the
# term frequencies and draws a word cloud.
#
# Arguments:
#   x            : plain text, a web page URL or a text file path,
#                  interpreted according to `type`.
#   type         : "text", "url" or "file"; only the first element is used,
#                  so the default vector means "text".
#   lang         : language passed to stopwords(); also used for stemming
#                  when SnowballC provides a stemmer for it.
#   excludeWords : optional character vector of extra words to remove.
#   textStemming : if TRUE, words are reduced to their stem.
#   colorPalette : name of an RColorBrewer palette, or any colour
#                  specification that is handed to wordcloud() unchanged.
#   min.freq     : passed to wordcloud() as min.freq.
#   max.words    : passed to wordcloud() as max.words.
#
# Returns (invisibly) a list with `tdm` (the term-document matrix) and
# `freqTable` (data frame of words and frequencies, most frequent first).
rquery.wordcloud <- function(x, type=c("text", "url", "file"),
                             lang="english", excludeWords=NULL,
                             textStemming=FALSE, colorPalette="Dark2",
                             min.freq=3, max.words=200)
{
  # Reject unknown values of `type` before doing any work: none of the
  # input branches below would match, and no input text would be read.
  type <- type[1]
  if (!is.character(type) || is.na(type) ||
      !type %in% c("text", "url", "file")) {
    stop("`type` must be one of \"text\", \"url\" or \"file\"; got ",
         paste(deparse(type), collapse = ""), call. = FALSE)
  }

  library("tm")
  library("SnowballC")
  library("wordcloud")
  library("RColorBrewer")

  # Read the raw text from the requested source
  raw_text <- switch(type,
    file = readLines(x),
    url  = html_to_text(x),
    text = x
  )

  # Load the text as a corpus
  docs <- Corpus(VectorSource(raw_text))
  # Convert the text to lower case
  docs <- tm_map(docs, content_transformer(tolower))
  # Remove numbers
  docs <- tm_map(docs, removeNumbers)
  # Remove stopwords for the language
  docs <- tm_map(docs, removeWords, stopwords(lang))
  # Remove punctuation
  docs <- tm_map(docs, removePunctuation)
  # Eliminate extra white spaces
  docs <- tm_map(docs, stripWhitespace)
  # Remove the caller's own stopwords
  if (!is.null(excludeWords)) {
    docs <- tm_map(docs, removeWords, excludeWords)
  }
  # Text stemming: use the stemmer for `lang` when SnowballC has one,
  # otherwise fall back to the English stemmer.
  if (textStemming) {
    stem_lang <- if (lang %in% getStemLanguages()) lang else "english"
    docs <- tm_map(docs, stemDocument, language = stem_lang)
  }

  # Create the term-document matrix and the sorted frequency table
  tdm <- TermDocumentMatrix(docs)
  m <- as.matrix(tdm)
  v <- sort(rowSums(m), decreasing = TRUE)
  d <- data.frame(word = names(v), freq = v)

  # A single RColorBrewer palette name is expanded to 8 colours; anything
  # else (colour name, hex code, vector of colours) is used as given.
  is_brewer_name <- length(colorPalette) == 1 &&
    colorPalette %in% rownames(brewer.pal.info)
  if (is_brewer_name) {
    colors <- brewer.pal(8, colorPalette)
  } else {
    colors <- colorPalette
  }

  # Plot the word cloud; the fixed seed makes the layout reproducible
  set.seed(1234)
  wordcloud(d$word, d$freq, min.freq = min.freq, max.words = max.words,
            random.order = FALSE, rot.per = 0.35,
            use.r.layout = FALSE, colors = colors)

  invisible(list(tdm = tdm, freqTable = d))
}
# html_to_text: download a web page and return its text as a single string.
#
# Argument:
#   url : address of the page to download.
# Returns one character string with all selected text nodes joined by spaces.
html_to_text <- function(url) {
  library(RCurl)
  library(XML)
  # Select every text node except those nested inside <script>, <style>,
  # <noscript> or <form> elements.
  xpath_query <- "//text()[not(ancestor::script)][not(ancestor::style)][not(ancestor::noscript)][not(ancestor::form)]"
  # Download the page source and parse it as HTML
  page_source <- getURL(url)
  parsed_page <- htmlParse(page_source, asText = TRUE)
  # Extract the value of each selected text node
  text_nodes <- xpathSApply(parsed_page, xpath_query, xmlValue)
  # Collapse the text vector into one character string
  paste(text_nodes, collapse = " ")
}
# Example inputs for rquery.wordcloud().
# Web page URL (to be used with type = "url")
deurl <- "https://dsfernandez.github.io/bioestadisticas/modulos/modulod1.html"
# Placeholder string; not used by the example call below
depath <- "url o dirección del archivo"
# Path to a local text file (to be used with type = "file")
detexto <- "./Biometria_encuesta_estadisticas-tu_20200815.txt"
# Example: word cloud built from the local text file, in Spanish
res <- rquery.wordcloud(
  x = detexto,
  type = "file",
  lang = "spanish",
  excludeWords = NULL,
  textStemming = FALSE,
  colorPalette = "Dark2",
  min.freq = 4,
  max.words = 100
)
x : character string (plain text, web URL, txt file path)
type : specify whether x is a plain text, a web page URL or a .txt file path
lang : the language of the text. This is important to be specified in order to remove the common stopwords (like ‘the’, ‘we’, ‘is’, ‘are’) from the text before further analysis. Supported languages are danish, dutch, english, finnish, french, german, hungarian, italian, norwegian, portuguese, russian, spanish and swedish.
excludeWords : a vector containing your own stopwords to be eliminated from the text, e.g. c("word1", "word2")
textStemming : reduces words to their root form. Default value is FALSE. A stemming process reduces the words “moving” and “movement” to the root word, “move”.
colorPalette : Possible values are: the name of a color palette from the RColorBrewer package (e.g. colorPalette = "Dark2"), a color name (e.g. colorPalette = "red"), or a color code (e.g. colorPalette = "#FF1245").
min.freq : words with frequency below min.freq will not be plotted
max.words : maximum number of words to be plotted; the least frequent terms are dropped
A partir de la función rquery.wordcloud() se pueden obtener dos objetos:
* tdm : matriz de términos-documentos (term-document matrix)
* freqTable : tabla de frecuencias
# Frequency table from the word cloud result; print its first 20 rows
TablaFrecuencia <- res[["freqTable"]]
head(TablaFrecuencia, n = 20)
## word freq
## estadísticas estadísticas 20
## datos datos 16
## vida vida 14
## estadisticas estadisticas 10
## estudios estudios 10
## manera manera 10
## pueden pueden 10
## ayuda ayuda 9
## ayudar ayudar 9
## poder poder 9
## tomar tomar 9
## cotidiana cotidiana 8
## decisiones decisiones 7
## ejemplo ejemplo 7
## mas mas 7
## entender entender 6
## forma forma 6
## futuro futuro 6
## importante importante 6
## ser ser 6
# Bar chart of the most frequent words (up to 10).
# head() is used instead of [1:10, ] so that a frequency table with fewer
# than ten rows does not yield NA bars and NA labels.
barplot(head(TablaFrecuencia$freq, 10), las = 2,
        names.arg = head(TablaFrecuencia$word, 10),
        col = "lightblue", main = "Palabras más frecuentes",
        ylab = "Frecuencia")
# Terms that occur at least 4 times in the term-document matrix
tdm <- res[["tdm"]]
findFreqTerms(tdm, lowfreq = 4)
## [1] "ayuda" "ayudan" "ayudar" "cotidiana" "covid"
## [6] "datos" "debido" "decisiones" "ejemplo" "entender"
## [11] "estadisticas" "estadísticas" "estudios" "forma" "futuro"
## [16] "hora" "importante" "manera" "mas" "mejor"
## [21] "poder" "podría" "pueden" "resultados" "ser"
## [26] "tener" "tomar" "ver" "vida"
# Terms associated with a given word (correlation of at least 0.1)
findAssocs(
  tdm,
  terms = "upr",
  corlimit = 0.1
)
## $upr
## numeric(0)