Basic Text Mining

L. Marqués Padreny

Thursday, October 1, 2015

Required Packages

library(SnowballC)        # stemming
library(qdap)             # text analysis
library(qdapDictionaries)
library(dplyr)            # data preparation
library(RColorBrewer)     # color palettes
library(wordcloud)
library(ggplot2)          # plots
library(scales)           # comma & number formatting
library(Rgraphviz)        # correlation plots
library(twitteR)          # the data we work with: ~200 tweets from @RDataMining
library(NLP)
library(tm)               # text mining

Sources and Formats

Sources supported by the tm package

getSources()
## [1] "DataframeSource" "DirSource"       "URISource"       "VectorSource"   
## [5] "XMLSource"       "ZipSource"

Formats (readers) supported by the tm package.

getReaders()
##  [1] "readDOC"                 "readPDF"                
##  [3] "readPlain"               "readRCV1"               
##  [5] "readRCV1asPlain"         "readReut21578XML"       
##  [7] "readReut21578XMLasPlain" "readTabular"            
##  [9] "readTagged"              "readXML"

Load the “rdmTweets” data: tweets from @RDataMining (twitteR package)

Data:

## Option 1: fetch the tweets from Twitter
#library(twitteR)
#tweets <- userTimeline("RDataMining", n = 3200)

## Option 2:
# The data links are at the end of the presentation; there are three different sets to practice with.
# The downloaded file provides the rdmTweets object used below.
for (i in 1:10) {
  cat(paste("[[", i, "]] ", sep=""))
  writeLines(strwrap(rdmTweets[[i]]$getText(), width=73))
}
## [[1]] Postdoc/Research Scientist Position on Big Data at MIT
## http://t.co/hZ1ojAW2
## [[2]] Research scientist position for privacy-preserving data publishing,
## Singapore http://t.co/GPA0TyG5
## [[3]] Easier Parallel Computing in R with snowfall and sfCluster
## http://t.co/BPcinvzK
## [[4]] Tutorial: Parallel computing using R package snowfall
## http://t.co/CHBCyr76
## [[5]] handling big data: Interacting with Data using the filehash Package for
## R http://t.co/7RB3sChx
## [[6]] Parallel Computing with R using snow and snowfall http://t.co/nxp8EZpv
## [[7]] State of the Art in Parallel Computing with R http://t.co/zmClglqi
## [[8]] Slides on Parallel Computing in R http://t.co/AdDVxbOY
## [[9]] R with High Performance Computing: Parallel processing and large memory
## http://t.co/XZ3ZZBRF
## [[10]] High Performance Computing with R http://t.co/4fnGArSc

Data Frame & the “Corpus” Text Object

Convert the tweets to a data frame (twitteR package)

df <- do.call("rbind", lapply(rdmTweets, as.data.frame))
dim(df)
## [1] 154  10
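
The twitteR package also provides a helper that does this conversion in a single call; an equivalent one-liner:

# equivalent to the do.call/rbind idiom above
df <- twListToDF(rdmTweets)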

Create our text object: the corpus (tm package)

myCorpus <- Corpus(VectorSource(df$text))
myCorpus
## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 154

Simple Text Transformations

The tm_map() function is used to apply different transformations to all documents in a corpus in order to obtain “clean” text.

getTransformations()
## [1] "removeNumbers"     "removePunctuation" "removeWords"      
## [4] "stemDocument"      "stripWhitespace"

Description of the tm_map() transformations

Inspect the initial text of tweets 11 through 15

for (i in 11:15) {
  cat(paste("[[", i, "]] ", sep=""))
  writeLines(strwrap(myCorpus[[i]], width=73))
}
## [[11]] Slides on massive data, shared and distributed memory,and concurrent
## programming: bigmemory and foreach http://t.co/a6bQzxj5
## [[12]] The R Reference Card for Data Mining is updated with functions &
## packages for handling big data & parallel computing.
## http://t.co/FHoVZCyk
## [[13]] Post-doc on Optimizing a Cloud for Data Mining primitives, INRIA, France
## http://t.co/cA28STPO
## [[14]] Chief Scientist - Data Intensive Analytics, Pacific Northwest National
## Laboratory (PNNL), US http://t.co/0Gdzq1Nt
## [[15]] Top 10 in Data Mining http://t.co/7kAuNvuf
leertexto <- function(corpus, from = 11, to = 15) {
  for (i in from:to) {
    cat(paste("[[", i, "]] ", sep=""))
    writeLines(strwrap(corpus[[i]], width=73))
  }
}

leertexto(myCorpus)

The Transformations

Convert uppercase to lowercase

myCorpus <- tm_map(myCorpus, tolower)    
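
Note: with tm version 0.6 and later, base R functions such as tolower must be wrapped in content_transformer() so that the documents keep their tm structure; a minimal sketch, assuming tm >= 0.6:

# on tm >= 0.6, wrap base functions with content_transformer()
myCorpus <- tm_map(myCorpus, content_transformer(tolower))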

Remove punctuation

myCorpus <- tm_map(myCorpus, removePunctuation)           

Remove the numbers, as long as they are irrelevant to our study

myCorpus <- tm_map(myCorpus, removeNumbers)               

Remove URLs

removeURL <- function(x) gsub("http[[:alnum:]]*", "", x)  
myCorpus <- tm_map(myCorpus, removeURL)                   

Stopwords

length(stopwords("spanish"))
## [1] 308
length(stopwords("english"))
## [1] 174
stopwords("spanish")[1:10]
##  [1] "de"  "la"  "que" "el"  "en"  "y"   "a"   "los" "del" "se"
stopwords("spanish")[200:210]
##  [1] "hubiera"    "hubieras"   "hubiéramos" "hubierais"  "hubieran"  
##  [6] "hubiese"    "hubieses"   "hubiésemos" "hubieseis"  "hubiesen"  
## [11] "habiendo"
stopwords("english")[1:10]
##  [1] "i"         "me"        "my"        "myself"    "we"       
##  [6] "our"       "ours"      "ourselves" "you"       "your"

Handling Our Own “Stop Words”

Add two extra stopwords: “available” and “via”

myStopwords <- c(stopwords('english'), "available", "via")
length(myStopwords)
## [1] 176

Remove words from the stopword set

myStopwords <- setdiff(myStopwords, c("r", "big"))  
myStopwords <- setdiff(myStopwords, c("and", "until"))  
length(myStopwords)
## [1] 174
myCorpus <- tm_map(myCorpus, removeWords, myStopwords)    

Remove extra whitespace

myCorpus <- tm_map(myCorpus, stripWhitespace)

Results

inspect(myCorpus[11:15])
## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 5
## 
## [[1]]
## [1] slides massive data shared and distributed memoryand concurrent programming bigmemory and foreach 
## 
## [[2]]
## [1]  r reference card data mining updated functions packages handling big data parallel computing 
## 
## [[3]]
## [1] postdoc optimizing cloud data mining primitives inria france 
## 
## [[4]]
## [1] chief scientist data intensive analytics pacific northwest national laboratory pnnl us 
## 
## [[5]]
## [1] top data mining

Example of Specific Transformations

Perform some specific transformations, for example on company or institution names, or other very obvious text.

# note: this toString masks base::toString; `docs` stands for any corpus
toString <- content_transformer(function(x, from, to) gsub(from, to, x))

docs <- tm_map(docs, toString, "Institute technology", " IT")
docs <- tm_map(docs, toString, "universitat politecnica catalunya", "UPC")
docs <- tm_map(docs, toString, "Universitat de Barcelona", "UB")

Stemming Words

library(SnowballC)

Stemming applies an algorithm that strips common word endings, keeping only the root. The functionality is provided by wordStem() from SnowballC.

An example of this function:

library(SnowballC)
getStemLanguages() # this function supports several languages
##  [1] "danish"     "dutch"      "english"    "finnish"    "french"    
##  [6] "german"     "hungarian"  "italian"    "norwegian"  "porter"    
## [11] "portuguese" "romanian"   "russian"    "spanish"    "swedish"   
## [16] "turkish"
#wordStem(words, language = "en")
wordStem(c("win", "winning", "winner"))
## [1] "win"    "win"    "winner"

Stem words

myCorpusCopy <- myCorpus                                                
myCorpus <- tm_map(myCorpus, stemDocument) 
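
The copy kept above makes it possible to later complete the stems back to full words with tm's stemCompletion(), using the unstemmed corpus as the dictionary; a sketch (not run here, and the example stems are illustrative):

# sketch: complete each stem to its most frequent full form in myCorpusCopy
# stemCompletion(c("mine", "comput", "analysi"), dictionary = myCorpusCopy)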

Inspect the final text of tweets 11 through 15

for (i in 11:15) {                                                      
    cat(paste("[[", i, "]] ", sep=""))                                      
    writeLines(strwrap(myCorpus[[i]], width=73))                            
  }
## [[11]] slides massive data shared and distributed memoryand concurrent
## programming bigmemory and foreach
## [[12]] r reference card data mining updated functions packages handling big
## data parallel computing
## [[13]] postdoc optimizing cloud data mining primitives inria france
## [[14]] chief scientist data intensive analytics pacific northwest national
## laboratory pnnl us
## [[15]] top data mining

Create a Term-Document Matrix

TermDocumentMatrix() builds a contingency table with words as rows and documents as columns. DocumentTermMatrix() builds the transpose.

myCorpus <- tm_map(myCorpus, PlainTextDocument)
myTdm <- TermDocumentMatrix(myCorpus, control=list(wordLengths=c(1,Inf)))
myTdm
## <<TermDocumentMatrix (terms: 585, documents: 154)>>
## Non-/sparse entries: 1239/88851
## Sparsity           : 99%
## Maximal term length: 25
## Weighting          : term frequency (tf)
idx <- which(dimnames(myTdm)$Terms == "r")
inspect(myTdm[idx+(0:5),101:110])
## <<TermDocumentMatrix (terms: 6, documents: 10)>>
## Non-/sparse entries: 9/51
## Sparsity           : 85%
## Maximal term length: 12
## Weighting          : term frequency (tf)
## 
##               Docs
## Terms          character(0) character(0) character(0) character(0)
##   r                       1            1            0            0
##   ramachandran            0            0            0            0
##   random                  0            0            0            0
##   ranked                  0            0            0            0
##   rapidminer              1            0            0            0
##   rdatamining             0            0            0            0
##               Docs
## Terms          character(0) character(0) character(0) character(0)
##   r                       2            0            0            1
##   ramachandran            0            0            0            0
##   random                  0            0            0            0
##   ranked                  0            0            0            0
##   rapidminer              0            0            0            0
##   rdatamining             0            0            0            1
##               Docs
## Terms          character(0) character(0)
##   r                       1            1
##   ramachandran            0            0
##   random                  0            0
##   ranked                  1            0
##   rapidminer              0            0
##   rdatamining             0            0
myTdm <- TermDocumentMatrix(myCorpus, control=list(minWordLength=1))  # minWordLength is a pre-tm-0.6 option; current tm uses wordLengths
class(myTdm)
## [1] "TermDocumentMatrix"    "simple_triplet_matrix"
dim(myTdm)
## [1] 572 154
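
As a quick sanity check (a sketch), a DocumentTermMatrix is simply the transpose of a TermDocumentMatrix, and tm's t() converts between the two classes:

# a DTM is the transpose of a TDM; t() converts between them
myDtm <- t(myTdm)
dim(myDtm)  # documents x terms, i.e. rev(dim(myTdm))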

Frequencies & Associations

Find the most relevant words for our study

findFreqTerms(myTdm, lowfreq=10)
##  [1] "analysis"     "and"          "data"         "examples"    
##  [5] "introduction" "mining"       "network"      "package"     
##  [9] "research"     "slides"       "social"       "tutorial"    
## [13] "using"
findFreqTerms(myTdm, lowfreq=20)
## [1] "analysis" "and"      "data"     "mining"

Columns

mydtm <- DocumentTermMatrix(myCorpus)
freqC <- colSums(as.matrix(mydtm))
length(freqC)
## [1] 572
ord <- order(freqC)
freqC[head(ord)]
##                   \034big                accessible 
##                         1                         1 
##                       acm addictedtorfreefrgraphiqu 
##                         1                         1 
##                  addition            administration 
##                         1                         1
freqC[tail(ord)]
##   slides  package analysis   mining      and     data 
##       16       17       22       50       54       63

Rows

termFrequency <- rowSums(as.matrix(myTdm))
length(termFrequency)
## [1] 572
termFrequency <- subset(termFrequency, termFrequency>=5)
length(termFrequency)
## [1] 42
head(termFrequency)
##   analysis        and  australia       card clustering       code 
##         22         54          5          5          8          7
head(table(termFrequency), 5)
## termFrequency
##  5  6  7  8  9 
## 15  5  3  5  1
tail(table(termFrequency), 5)
## termFrequency
## 17 22 50 54 63 
##  1  1  1  1  1

Plots

library(ggplot2) 
library(dplyr)

freq <- sort(rowSums(as.matrix(myTdm)), decreasing=TRUE)
head(freq, 14)
##         data          and       mining     analysis      package 
##           63           54           50           22           17 
##       slides     examples      network     tutorial     research 
##           16           15           15           13           12 
##       social introduction        using    computing 
##           12           10           10            9
wf <- data.frame(word=names(freq), freq=freq)
head(wf)
##              word freq
## data         data   63
## and           and   54
## mining     mining   50
## analysis analysis   22
## package   package   17
## slides     slides   16

subset(wf, freq > 5) %>%
  ggplot(aes(word, freq)) +
  geom_bar(stat="identity") +
  theme(axis.text.x=element_text(angle=45, hjust=1))

Barplot

barplot(termFrequency, las=2)

Identifying Associations

If two words always appear together, the correlation is 1.0; if they never appear together, it is 0.0. Correlation is thus a measure of how closely associated words are in the corpus.
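
Under the hood, findAssocs() essentially computes the Pearson correlation between rows of the term-document matrix; a hand-rolled sketch for two terms:

# sketch: correlation between the "data" and "mining" rows of the TDM
tdmMat <- as.matrix(myTdm)
cor(tdmMat["data", ], tdmMat["mining", ])  # should match the value reported below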

Which words are associated with “data”?

findAssocs(myTdm, 'data', 0.30)
##           data
## mining    0.52
## incl      0.34
## practice  0.34
## realworld 0.34
## small     0.34

Which words are associated with “mining”?

findAssocs(myTdm, 'mining', 0.25)
##                mining
## data             0.52
## mahout           0.41
## recommendation   0.41
## sets             0.41
## supports         0.41
## frequent         0.36
## itemset          0.35
## card             0.30
## reference        0.30
## functions        0.28
## text             0.28
## classification   0.27
## lecture          0.27
## appl             0.25
## chapters         0.25
## lists            0.25
## useful           0.25
## want             0.25

Correlation Plots

library(graph)
library(Rgraphviz)
## Loading required package: grid
# plot(myTdm, terms=findFreqTerms(myTdm, lowfreq=10)[1:5], corThreshold=0.25)
plot(myTdm, terms=findFreqTerms(myTdm, lowfreq=5)[1:30], corThreshold=0.25)

Wordcloud

Generate a word cloud to provide a quick visual summary of word frequencies in a corpus. The wordcloud package provides the required wordcloud() function. set.seed() is used so that we get the same layout each time; otherwise a random layout is chosen.

library(RColorBrewer)
library(wordcloud)

Compute the word frequencies and sort them in decreasing order

m <- as.matrix(myTdm)
wordFreq <- sort(rowSums(m), decreasing=TRUE)

Wordcloud plot 1

set.seed(375)  # to make it reproducible
grayLevels <- gray((wordFreq + 10) / (max(wordFreq) + 10))
wordcloud(words=names(wordFreq), freq=wordFreq, min.freq=3, random.order=F, colors=grayLevels)

Wordcloud plot 2

set.seed(142)
wordcloud(words=names(wordFreq), freq=wordFreq, max.words=50)

Wordcloud plot 3

set.seed(375)
wordcloud(words=names(wordFreq), freq=wordFreq, max.words=150, scale=c(6,.5), colors=brewer.pal(6,"Dark2"))

Clustering Words

Remove the sparse terms

myTdm2 <- removeSparseTerms(myTdm, sparse=0.95)
m2 <- as.matrix(myTdm2)
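
A term survives removeSparseTerms(myTdm, sparse=0.95) only if it occurs in more than 5% of the documents (here, at least 8 of the 154 tweets); the reduction can be checked directly:

# compare the number of terms before and after dropping the sparse ones
dim(myTdm)   # all terms
dim(myTdm2)  # only the dense terms remain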

Cluster the terms

distMatrix <- dist(scale(m2))
fit <- hclust(distMatrix, method="ward.D")  # "ward" was renamed "ward.D" in recent R versions
plot(fit)
# cut the tree into 10 clusters
rect.hclust(fit, k=10)

(groups <- cutree(fit, k=10))
##     analysis          and    computing         data     examples 
##            1            2            3            4            5 
## introduction       mining      network      package     parallel 
##            3            6            1            7            3 
##     research       series       slides       social         time 
##            8            9           10            1            9 
##     tutorial        using 
##            7            3

Clustering Tweets with the k-means Algorithm

Transpose the matrix so that tweets become rows, which is what we cluster

m3 <- t(m2)

Set a random seed

set.seed(122)

k-means clustering of the tweets

k <- 8
kmeansResult <- kmeans(m3, k)
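
Here k = 8 is fixed by hand; a common way to choose it is the elbow heuristic on the total within-cluster sum of squares (a sketch; the nstart argument is an extra stabilising assumption):

# elbow heuristic: total within-cluster SS for a range of k
wss <- sapply(2:15, function(k) kmeans(m3, k, nstart=10)$tot.withinss)
plot(2:15, wss, type="b", xlab="Number of clusters k", ylab="Total within-cluster SS")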

Cluster centers

round(kmeansResult$centers, digits=3)
##   analysis   and computing  data examples introduction mining network
## 1    0.033 0.262     0.016 0.000    0.098        0.049  0.000   0.049
## 2    0.000 0.375     0.125 2.125    0.000        0.125  1.125   0.000
## 3    0.917 0.083     0.000 0.000    0.083        0.167  0.083   1.000
## 4    0.000 0.625     0.875 0.000    0.000        0.000  0.000   0.000
## 5    0.000 0.176     0.000 0.735    0.059        0.029  1.059   0.000
## 6    0.250 1.875     0.000 0.625    0.375        0.250  0.125   0.000
## 7    0.500 0.500     0.000 0.125    0.125        0.125  0.375   0.000
## 8    0.200 0.267     0.000 1.000    0.133        0.000  0.000   0.000
##   package parallel research series slides social time tutorial using
## 1   0.098    0.000    0.049      0  0.115  0.033    0    0.066 0.016
## 2   0.250    0.125    0.125      0  0.125  0.000    0    0.000 0.250
## 3   0.083    0.000    0.083      0  0.083  0.833    0    0.167 0.083
## 4   0.375    0.875    0.000      0  0.125  0.000    0    0.125 0.250
## 5   0.029    0.000    0.000      0  0.059  0.000    0    0.118 0.029
## 6   0.500    0.000    0.125      0  0.125  0.000    0    0.000 0.375
## 7   0.000    0.000    0.000      1  0.250  0.000    1    0.125 0.000
## 8   0.000    0.000    0.400      0  0.067  0.000    0    0.067 0.000
for (i in 1:k) {
  cat(paste("cluster ", i, ": ", sep=""))
  s <- sort(kmeansResult$centers[i,], decreasing=T)
  cat(names(s)[1:3], "\n")
}
## cluster 1: and slides examples 
## cluster 2: data mining and 
## cluster 3: network analysis social 
## cluster 4: computing parallel and 
## cluster 5: mining data and 
## cluster 6: and data package 
## cluster 7: series time analysis 
## cluster 8: data research and

Plot the k-means Clusters

library(cluster)
d <- dist(scale(m2), method="euclidean")  # distances between the scaled term vectors; note dist() rejects the misspelling "euclidian"
kfit <- kmeans(d, 4)
clusplot(as.matrix(d), kfit$cluster, color=T, shade=T, labels=2, lines=0)

Clustering Tweets with the k-medoids Algorithm

library(fpc)

Partition around medoids and estimate the number of clusters

# pamResult <- pamk(m3, metric="manhattan")  # alternative metric
pamResult <- pamk(m3, metric="euclidean")
# number of clusters identified
(k <- pamResult$nc)
## [1] 10
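
pamk() selects this number by maximising the average silhouette width over its candidate range; the criterion values can be inspected before the pam object is extracted below (a sketch):

# average silhouette width for each candidate number of clusters
round(pamResult$crit, 2)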
pamResult <- pamResult$pamobject

Print the cluster medoids

for (i in 1:k) {
    cat(paste("cluster", i, ": "))
    cat(colnames(pamResult$medoids)[which(pamResult$medoids[i,]==1)], "\n")
    # print tweets in cluster i
    # print(rdmTweets[pamResult$clustering==i])
} 
## cluster 1 : data 
## cluster 2 : and computing parallel 
## cluster 3 :  
## cluster 4 : and data mining 
## cluster 5 : data mining 
## cluster 6 : analysis network social 
## cluster 7 : and 
## cluster 8 : and package 
## cluster 9 : mining 
## cluster 10 : analysis and mining series time

Plot the clusters

layout(matrix(c(1,2),2,1)) # set to two graphs per page
plot(pamResult, color=F, labels=4, lines=0, cex=.8, col.clus=1, col.p=pamResult$clustering)

layout(matrix(1)) # change back to one graph per page

Quantifying the Words

library(qdap)

Summarize the list of words.

words <- myTdm %>% as.matrix %>% rownames %>% (function(x) x[nchar(x) < 20])  # keep terms under 20 characters
head(words, 15)                                                                                                       
##  [1] "\034big"        "access"         "accessible"     "acm"           
##  [5] "added"          "addition"       "administration" "advanced"      
##  [9] "afl"            "aggregating"    "ago"            "algorithms"    
## [13] "also"           "america"        "amounts"
summary(nchar(words))                                                                                                 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   3.000   5.000   7.000   7.098   9.000  19.000
table(nchar(words))                                                                                                   
## 
##  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 19 
## 44 74 68 69 77 76 56 45 30 10  5  9  4  1  2  1
dist_tab(nchar(words))                                                                                                
##    interval freq cum.freq percent cum.percent
## 1         3   44       44    7.71        7.71
## 2         4   74      118   12.96       20.67
## 3         5   68      186   11.91       32.57
## 4         6   69      255   12.08       44.66
## 5         7   77      332   13.49       58.14
## 6         8   76      408   13.31       71.45
## 7         9   56      464    9.81       81.26
## 8        10   45      509    7.88       89.14
## 9        11   30      539    5.25       94.40
## 10       12   10      549    1.75       96.15
## 11       13    5      554    0.88       97.02
## 12       14    9      563    1.58       98.60
## 13       15    4      567    0.70       99.30
## 14       16    1      568    0.18       99.47
## 15       17    2      570    0.35       99.82
## 16       19    1      571    0.18      100.00

Letter and Word Frequencies

data.frame(nletters=nchar(words)) %>%
  ggplot(aes(x=nletters)) +
  geom_histogram(binwidth=1) +
  geom_vline(xintercept=mean(nchar(words)), colour="green", size=1, alpha=.5) +
  labs(x="Number of Letters", y="Number of Words")

library(dplyr)                                  
library(stringr)     
## 
## Attaching package: 'stringr'
## 
## The following object is masked from 'package:qdap':
## 
##     %>%
## 
## The following object is masked from 'package:graph':
## 
##     boundary

Frequency of the Individual Letters

words %>%
  str_split("") %>%
  sapply(function(x) x[-1]) %>%
  unlist %>%
  dist_tab %>%
  mutate(Letter=factor(toupper(interval), levels=toupper(interval[order(freq)]))) %>%
  ggplot(aes(Letter, weight=percent)) +
  geom_bar() +
  coord_flip() +
  ylab("Proportion") +
  scale_y_continuous(breaks=seq(0, 12, 2), label=function(x) paste0(x, "%"), expand=c(0,0), limits=c(0,12))

Heatmap of Letters & Position

words %>%
  lapply(function(x) sapply(letters, gregexpr, x, fixed=TRUE)) %>%
  unlist %>%
  (function(x) x[x != -1]) %>%                               # drop letters that do not occur
  (function(x) setNames(x, gsub("\\d", "", names(x)))) %>%   # strip the numeric suffixes added by unlist
  (function(x) apply(table(data.frame(letter=toupper(names(x)),
                                      position=unname(x))),
                     1, function(y) y/length(x))) %>%
  qheat(high="green", low="yellow", by.column=NULL,
        values=TRUE, digits=3, plot=FALSE) +
  ylab("Letter") +
  xlab("Position") +
  theme(axis.text.x=element_text(angle=0)) +
  guides(fill=guide_legend(title="Proportion"))

Exercise

Data links:

https://drive.google.com/file/d/0B8gvT0gpUPyvV1NJeWFRUTVpSk0/view?usp=sharing
https://drive.google.com/file/d/0B8gvT0gpUPyvclE4REhvQnZRemc/view?usp=sharing
https://drive.google.com/file/d/0B8gvT0gpUPyvaFlRYmd4NEJ0Z1k/view?usp=sharing

# 0. Data
install.packages(c("tm", "wordcloud", "twitteR"))  # note: install.packages() takes a character vector
load("data")  # replace "data" with the downloaded file
# 1. Create a data frame
# ("tweets" below stands for whatever object the loaded file provides)
df <- do.call("rbind", lapply(tweets, as.data.frame))
dim(df)
# 2. Create our text object: the corpus
myCorpus <- Corpus(VectorSource(df$text))
myCorpus
# 3. Apply some transformations:
myCorpus <- tm_map(myCorpus, tolower)
myCorpus <- tm_map(myCorpus, removePunctuation)
myCorpus <- tm_map(myCorpus, removeNumbers)
removeURL <- function(x) gsub("http[[:alnum:]]*", "", x)
myCorpus <- tm_map(myCorpus, removeURL)
myCorpus <- tm_map(myCorpus, stripWhitespace)
# read the text after each step
# 4. Create the matrix
myCorpus <- tm_map(myCorpus, PlainTextDocument)
myTdm <- TermDocumentMatrix(myCorpus, control=list(wordLengths=c(1,Inf)))
m <- as.matrix(myTdm)
# 5. Create a wordcloud
wordFreq <- sort(rowSums(m), decreasing=TRUE)
set.seed(xxxxxx)  # fill in a seed
wordcloud(xxxxx)  # fill in the arguments