---
title: "CS695: Week 3 wordCloud Notebook"
output:
  html_document:
    df_print: paged
---

This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.

Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Cmd+Shift+Enter.

Install the necessary packages, then comment these lines out after installation.

#install.packages('tm')
#install.packages('RColorBrewer')
#install.packages('wordcloud')
#install.packages("slam", type = "binary")

Load the packages.

library('tm')
## Loading required package: NLP
library('RColorBrewer')
library('wordcloud')
library('slam')

Process data

BreneBrownData <- readRDS("BreneBrown.RDS")
tweets <- BreneBrownData$text
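A quick sanity check on the loaded data (optional; the exact columns in BreneBrownData depend on how BreneBrown.RDS was built):

# optional: inspect the loaded data before cleaning
length(tweets)   # number of tweets
head(tweets, 3)  # first few raw tweets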

# Alternative ways to swap out all non-alphanumeric characters (kept here for reference)
# Note that what counts as a letter, number, or punctuation mark varies slightly with your locale, so you may need to experiment a little to get exactly what you want.
# str_replace_all(tweets, "[^[:alnum:]]", " ")
# iconv(tweets, from = 'UTF-8', to = 'ASCII//TRANSLIT')
# Encoding(tweets)  <- "UTF-8"
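If non-ASCII characters (emoji, curly quotes) cause trouble later, one option is to transliterate the text with base R's iconv before cleaning; the str_replace_all line above would additionally need library(stringr). A minimal sketch, commented out, to apply before clean.text below:

# optional: transliterate to ASCII before cleaning (characters with no
# ASCII equivalent are dropped because sub = "")
# tweets <- iconv(tweets, from = "UTF-8", to = "ASCII//TRANSLIT", sub = "")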

# Function to clean tweets
clean.text = function(x)
{
  # remove the retweet marker "rt" (note: this also strips "rt" inside words, e.g. "heart" becomes "hea")
  x = gsub("rt", "", x)
  # remove @mentions
  x = gsub("@\\w+", "", x)
  # remove punctuation
  x = gsub("[[:punct:]]", "", x)
  # remove numbers
  x = gsub("[[:digit:]]", "", x)
  # remove links http
  x = gsub("http\\w+", "", x)
  # remove runs of two or more spaces or tabs
  x = gsub("[ |\t]{2,}", "", x)
  # remove blank spaces at the beginning
  x = gsub("^ ", "", x)
  # remove blank spaces at the end
  x = gsub(" $", "", x)
  # tolower
  # x = tolower(x)
  return(x)
}

# clean tweets
tweets = clean.text(tweets)
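As a quick illustration of what clean.text does, here is a made-up example tweet (the expected result is noted in the comment, so it is easy to spot if the cleaning rules change):

# made-up example: retweet marker, mention, link, digits, and punctuation are stripped
clean.text("rt @brene Check https://t.co/abc 123!")
# should return roughly "Check"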

Create a word cloud of the tweets

corpus = Corpus(VectorSource(tweets))

# create term-document matrix
tdm = TermDocumentMatrix(
  corpus,
  control = list(
    wordLengths=c(3,20),
    removePunctuation = TRUE,
    stopwords = c("the", "a", stopwords("english")),
    removeNumbers = TRUE, 
  # tolower may cause trouble on Windows because of UTF-8 encoding, so it is set to FALSE
  # (with tolower = FALSE, capitalized stopwords such as "The" and "This" survive, as seen in the output below)
    tolower = FALSE) )
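Before converting to a dense matrix, you can inspect the sparse term-document matrix directly; findFreqTerms() ships with tm (the threshold of 100 below is arbitrary):

# optional checks on the sparse term-document matrix
dim(tdm)                           # terms x documents (tweets)
findFreqTerms(tdm, lowfreq = 100)  # terms with a total count of at least 100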

# convert to a dense matrix; this may consume close to 1 GB of RAM
tdm = as.matrix(tdm)

# get word counts in decreasing order
word_freqs = sort(rowSums(tdm), decreasing=TRUE) 
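If the dense conversion above eats too much RAM, a lighter alternative is to compute the same frequencies on the sparse matrix with slam (already loaded), skipping as.matrix() entirely; a commented sketch:

# memory-friendlier alternative (would replace the as.matrix() and rowSums() steps above)
# word_freqs = sort(slam::row_sums(tdm), decreasing = TRUE)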

#check the top 50 most mentioned words
head(word_freqs, 50)
##          work         brave          This          easy          kind 
##           657           645           591           572           547 
##          week      practice       awkward          shit        RTLove 
##           546           543           542           539           536 
##          Stay           ton       profess         image          book 
##           532           532           530           523           185 
##           amp          love       courage           The           Its 
##           178           167           139           119           116 
##           one          read vulnerability        people          also 
##           115           106            99            95            93 
##    vulnerable          will         truth           can        change 
##            92            89            89            88            84 
##          need      choosing          Dare          like     imperfect 
##            83            82            82            81            80 
##         Brown         youre        doesnt          Lead         shame 
##            79            73            73            72            70 
##         right       amazing           RTI         Thank          know 
##            67            67            66            65            65 
##       reading          ever          time           hea          lead 
##            64            63            62            62            60
#remove the top words that don't generate insights (here the first five: "work", "brave", "This", "easy", "kind")
word_freqs = word_freqs[-(1:5)]  #"1:5" selects the 1st-5th words in the frequency list for removal
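Dropping words by position is fragile if the data changes; an alternative is to drop them by name instead (the word list below is just an example):

# alternative: remove uninformative words by name rather than by position
# drop_words = c("amp", "will", "can", "also")   # example list, adjust as needed
# word_freqs = word_freqs[!names(word_freqs) %in% drop_words]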

# create a data frame with words and their frequencies
dm = data.frame(word=names(word_freqs), freq=word_freqs)

#Plot the top words in a colored word cloud; needs the RColorBrewer package

wordcloud(head(dm$word, 50), head(dm$freq, 50), random.order=FALSE, colors=brewer.pal(8, "Dark2"))
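wordcloud() also takes tuning arguments such as max.words, scale, and rot.per; a commented variant that lets wordcloud() pick the top 50 words itself (the values are arbitrary):

# alternative call with explicit sizing and rotation settings
# wordcloud(dm$word, dm$freq, max.words = 50, scale = c(4, 0.5), rot.per = 0.15,
#           random.order = FALSE, colors = brewer.pal(8, "Dark2"))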

#check the top 50 most mentioned words again, after dropping the first five
head(word_freqs, 50)
##          week      practice       awkward          shit        RTLove 
##           546           543           542           539           536 
##          Stay           ton       profess         image          book 
##           532           532           530           523           185 
##           amp          love       courage           The           Its 
##           178           167           139           119           116 
##           one          read vulnerability        people          also 
##           115           106            99            95            93 
##    vulnerable          will         truth           can        change 
##            92            89            89            88            84 
##          need      choosing          Dare          like     imperfect 
##            83            82            82            81            80 
##         Brown         youre        doesnt          Lead         shame 
##            79            73            73            72            70 
##         right       amazing           RTI         Thank          know 
##            67            67            66            65            65 
##       reading          ever          time           hea          lead 
##            64            63            62            62            60 
##          make          hard        afraid              ️           see 
##            59            59            59            58            58
# I see some words I don't know or understand, so I retrieve the tweets that contain them
# For example, retrieve all the tweets that contain "solutions"

index = grep("solutions", tweets)
tweets[index]
## character(0)
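Passing value = TRUE (and ignore.case = TRUE) to grep() returns the matching tweets directly, which is handy for chasing down odd tokens such as "hea"; the search terms below are just examples:

# example lookups with hypothetical search terms
# grep("hea", tweets, ignore.case = TRUE, value = TRUE)      # trace the mangled token "hea"
# grep("awkward", tweets, ignore.case = TRUE, value = TRUE)  # tweets mentioning "awkward"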

Add a new chunk by clicking the Insert Chunk button on the toolbar or by pressing Cmd+Option+I.

When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the Preview button or press Cmd+Shift+K to preview the HTML file).

The preview shows you a rendered HTML copy of the contents of the editor. Consequently, unlike Knit, Preview does not run any R code chunks. Instead, the output of the chunk when it was last run in the editor is displayed.