##Install Packages
# install.packages("tm") # for text mining
# install.packages("wordcloud") # word-cloud generator
# install.packages("RColorBrewer") # color palettes
## Load required libraries
library(tm)
## Warning: package 'tm' was built under R version 3.4.4
## Loading required package: NLP
library(RColorBrewer)
library(wordcloud)
## Warning: package 'wordcloud' was built under R version 3.4.4
## Read the data
# Load the saved Zynga data set from an absolute, machine-specific path.
tweetsDS <- readRDS("C:\\Users\\206429159\\Documents\\Rstudio\\Midterm\\zynga.rds")
## NOTE: on the course machine this should be M:/S18/CS695/Midterm/Zynga.RDS -- written with forward slashes, not backslashes.
tweets <- tweetsDS$text # the `text` column holds the tweet text we work on (per the original author it is the first column; open the zynga data set in the viewer to see the columns)
# Function to clean tweets.
#
# x: character vector of raw tweet texts.
# Returns a character vector of the same length that is lower-cased, with
# non-ASCII characters, links, @mentions, the stand-alone retweet marker
# "rt", punctuation and digits removed, whitespace runs collapsed to a single
# space, and leading/trailing spaces trimmed.
clean.text <- function(x)
{
  # turn tabs/newlines into spaces first, so the ASCII filter below does not
  # glue neighbouring words together
  x <- gsub("[\t\r\n]+", " ", x)
  # remove everything outside printable ASCII (emoji, accented letters, ...)
  x <- gsub("[^\x20-\x7E]", "", x)
  # lower-case early so the patterns below also match "RT", "HTTP", ...
  x <- tolower(x)
  # remove links while they are still intact (before punctuation is stripped)
  x <- gsub("http\\S+", "", x)
  # remove @mentions
  x <- gsub("@\\w+", "", x)
  # remove the retweet marker only when it is a whole word; a bare "rt"
  # pattern would also eat the "rt" inside words such as "party" or "smart"
  x <- gsub("\\brt\\b", "", x)
  # remove punctuation
  x <- gsub("[[:punct:]]", "", x)
  # remove numbers
  x <- gsub("[[:digit:]]", "", x)
  # collapse whitespace runs to ONE space (replacing them with "" would fuse
  # the surrounding words)
  x <- gsub("[[:space:]]+", " ", x)
  # remove blank spaces at the beginning and at the end
  x <- gsub("^ +| +$", "", x)
  x
}
# clean tweets
tweets <- clean.text(tweets)
## Create word cloud of tweets of Zynga
# wrap the cleaned tweets in a tm corpus, one document per tweet
corpus <- Corpus(VectorSource(tweets))
# corpus = Corpus(VectorSource(cmail))
# create term-document matrix: keep terms of 3 to 20 characters, drop
# punctuation, numbers and English stop words
tdm <- TermDocumentMatrix(
  corpus,
  control = list(
    wordLengths = c(3, 20),
    removePunctuation = TRUE,
    stopwords = c("the", "a", stopwords("english")),
    removeNumbers = TRUE
  )
)
# convert to a plain (dense) matrix so rowSums() can be used
tdm <- as.matrix(tdm)
# get word counts in decreasing order
word_freqs <- sort(rowSums(tdm), decreasing = TRUE)
# remove the top words which don't generate insights; 1:9 are the positions
# (1 = most frequent word) of the entries to drop. This is done BEFORE the
# data frame is built so the word cloud reflects the removal too.
word_freqs <- word_freqs[-seq_len(9)]
# create a data frame with the remaining words and their frequencies
dm <- data.frame(word = names(word_freqs), freq = word_freqs)
# plot the corpus as a colored word cloud (up to 200 words); needs the
# RColorBrewer package for the palette
wordcloud(
  head(dm$word, 200),
  head(dm$freq, 200),
  random.order = FALSE,
  colors = brewer.pal(8, "Dark2")
)

# check the 20 most mentioned words
head(word_freqs, 20)
## looking can now prized adult petra
## 259 233 222 219 187 187
## play game trees jeneva found rewards
## 182 181 180 174 167 164
## points bit video sponsorship needing shook
## 144 140 139 139 138 138
## gotas rtherescar
## 137 137