This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.

Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Cmd+Shift+Enter.

Process data

# Import the saved Zynga tweet data from Zynga.RDS.
# NOTE(review): this is an absolute, machine-specific Windows path, so the
# script only runs where that exact file exists; consider a project-relative
# path.
zynga <- readRDS("C:/Users/Mounika/Zynga.RDS")
# Keep only the `text` element of the loaded object (presumably the tweet
# bodies -- confirm against the structure of the saved object).
zyngaTweets <- zynga$text

#********************************************
#         Clean tweets
#********************************************
# Clean raw tweet text ahead of tokenising.
#
# Strips retweet markers, @mentions, links, punctuation, digits and every
# character outside printable ASCII, then collapses runs of spaces to a single
# space and trims both ends.
#
# Args:
#   x: character vector of tweets. NA elements are passed through as NA.
# Returns:
#   A character vector of the same length as `x`.
clean.text <- function(x) {
  # Retweet marker: match "rt" only as a whole word and in any case, so that
  # "RT" is caught while words such as "party" or "start" are left intact.
  x <- gsub("\\brt\\b", "", x, ignore.case = TRUE)
  # @mentions
  x <- gsub("@\\w+", "", x)
  # Links: removed while the URL is still intact, i.e. before punctuation and
  # digits are stripped from it.
  x <- gsub("http\\S+", "", x)
  # Punctuation
  x <- gsub("[[:punct:]]", "", x)
  # Numbers
  x <- gsub("[[:digit:]]", "", x)
  # Anything outside printable ASCII (this includes tabs and newlines) becomes
  # a space. Done before whitespace is normalised so the spaces it introduces
  # are collapsed and trimmed too.
  x <- gsub("[^\x20-\x7E]", " ", x)
  # Collapse runs of spaces to ONE space; deleting them outright would glue
  # the neighbouring words together.
  x <- gsub(" {2,}", " ", x)
  # Drop all leading and trailing whitespace.
  trimws(x)
}

Create word cloud

# Normalise the raw tweet text with clean.text() before tokenising.
zyngaTweets = clean.text(zyngaTweets)

# Create a word cloud of the tweets of Zynga users.
# NOTE(review): Corpus(), VectorSource(), TermDocumentMatrix() and stopwords()
# are not defined in this section -- presumably the tm package is attached
# elsewhere; confirm.

# Build a corpus from the cleaned tweet vector.
corpus = Corpus(VectorSource(zyngaTweets))
# Leftover alternative input; `cmail` is not defined in this section.
# corpus = Corpus(VectorSource(cmail))
# Create the term-document matrix (terms are the rows: the row sums are used
# as word counts further down).
# NOTE(review): tm's termFreq() documentation describes some control options
# as being applied in the order they are listed -- confirm before reordering
# the entries below.
tdm = TermDocumentMatrix(
  corpus,
  control = list(
    # term length bounds passed to tm: minimum 3, maximum 20 characters
    wordLengths=c(3,20), 
    removePunctuation = TRUE,
    # custom stop word list: "the" and "a" plus tm's English list
    stopwords = c("the", "a", stopwords("english")),
    removeNumbers = TRUE) )

# Convert the term-document matrix to a base matrix so rowSums() can be used.
# NOTE(review): this can be memory-hungry for a large corpus.
tdm <- as.matrix(tdm)

# Total count of each term across all documents, most frequent first.
word_freqs <- sort(rowSums(tdm), decreasing = TRUE)

# Remove the top words which don't generate insights such as "the", "a",
# "and", etc. Position 1 is the most frequent word, so this drops the nine
# most frequent terms. It has to happen BEFORE `dm` is built; otherwise the
# data frame that gets plotted would still contain those words.
word_freqs <- word_freqs[-seq_len(9)]

# Create a data frame with the remaining words and their frequencies.
# stringsAsFactors = FALSE keeps `word` a character column on R < 4.0 too.
dm <- data.frame(
  word = names(word_freqs),
  freq = word_freqs,
  stringsAsFactors = FALSE
)


# Plot the first 100 rows of `dm` as a coloured word cloud.
# brewer.pal() needs the RColorBrewer package; the 8-colour "Dark2" palette
# supplies the word colours.
wordcloud(
  words = head(dm$word, 100),
  freq = head(dm$freq, 100),
  random.order = FALSE,
  colors = brewer.pal(8, "Dark2")
)

Midterm Exam

Process data

Create word cloud