This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.
Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Cmd+Shift+Enter.
# Install necessary packages (run once per machine, then leave commented out)
# install.packages('tm')
# install.packages('RColorBrewer')
# install.packages('wordcloud')
# Attach the packages used further down: tm provides Corpus(), VectorSource(),
# TermDocumentMatrix() and stopwords(); RColorBrewer provides brewer.pal();
# wordcloud provides wordcloud(). Without these calls the script fails in a
# fresh R session with "could not find function".
library(tm)
library(RColorBrewer)
library(wordcloud)
# Import data from a csv file to data frame
# stringsAsFactors = FALSE keeps the text column as character on R < 4.0.
# NOTE(review): with comment.char = "#", everything after an unquoted '#' on a
# line is dropped; confirm the tweet text in Trump.csv is quoted, otherwise
# hashtags will truncate the rows.
trump <- read.csv("Trump.csv", comment.char = "#", stringsAsFactors = FALSE)
# Fail loudly if the expected column is absent; `$` would otherwise return
# NULL and the rest of the script would silently run on no data.
stopifnot("MESSAGE_BODY" %in% names(trump))
tweets <- trump$MESSAGE_BODY
# Read data in a R data object
# BKdata <- readRDS("BKdata.rds")
# tweets <- BKdata$MESSAGE_BODY
# Function to clean tweets
# Normalise raw tweet text ahead of tokenisation.
#
# Args:
#   x: vector of tweet texts (character, or anything tolower() can coerce).
#
# Returns:
#   Character vector of the same length as x: lower-cased, with links, the
#   standalone retweet marker "rt", @mentions, punctuation and digits removed,
#   every run of whitespace collapsed to one space, and both ends trimmed.
clean.text <- function(x) {
  x <- tolower(x)
  # Drop links first, while "://" and "/" are still present, so the whole URL
  # goes in one match instead of relying on punctuation having been stripped.
  x <- gsub("http[^[:space:]]+", "", x)
  # Remove the retweet marker only when it is a whole word; a bare "rt"
  # pattern would also eat the letters inside "party", "support", etc.
  x <- gsub("\\brt\\b", "", x)
  # Remove @mentions
  x <- gsub("@\\w+", "", x)
  # Remove punctuation
  x <- gsub("[[:punct:]]", "", x)
  # Remove numbers
  x <- gsub("[[:digit:]]", "", x)
  # Collapse whitespace runs (spaces, tabs, newlines) to a single space.
  # Replacing them with "" would glue the neighbouring words together.
  x <- gsub("[[:space:]]+", " ", x)
  # Strip any remaining leading/trailing whitespace
  trimws(x)
}
# Run every tweet through clean.text() before any text mining
tweets <- clean.text(tweets)

# ---- Word cloud: corpus and term frequencies ----
# Wrap the cleaned tweets in a tm corpus (one document per tweet)
corpus <- Corpus(VectorSource(tweets))
# corpus = Corpus(VectorSource(cmail))

# Build the term-document matrix. The control options are listed in the same
# order as before on purpose: tm may apply them in the order given.
tdm <- TermDocumentMatrix(
  corpus,
  control = list(
    wordLengths = c(3, 20),
    removePunctuation = TRUE,
    stopwords = c("the", "a", stopwords("english")),
    removeNumbers = TRUE,
    tolower = TRUE
  )
)

# Densify to a plain matrix (terms in rows, documents in columns)
tdm <- as.matrix(tdm)

# Total occurrences of each term across all tweets, most frequent first
word_freqs <- sort(rowSums(tdm), decreasing = TRUE)

# Inspect the 20 most frequent terms
head(word_freqs, 20)
election trump gop vote
3448 1121 572 454
tcot will republicans new
272 244 224 210
donaldtrump electiontrumptrain gopdebate president
183 159 148 144
like amp get donald
140 138 130 125
politics one win take
124 124 122 111
# Drop top-ranked terms that carry no insight (e.g. the search keyword itself).
# Index 1 is the first, i.e. most frequent, entry of word_freqs.
word_freqs <- word_freqs[-1]

# Two-column table: each remaining term alongside its frequency
dm <- data.frame(word = names(word_freqs), freq = word_freqs)

# Draw the 50 most frequent terms as a coloured cloud, largest in the centre;
# the palette comes from the RColorBrewer package
wordcloud(
  words = head(dm$word, 50),
  freq = head(dm$freq, 50),
  random.order = FALSE,
  colors = brewer.pal(8, "Dark2")
)
Add a new chunk by clicking the Insert Chunk button on the toolbar or by pressing Cmd+Option+I. When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the Preview button or press Cmd+Shift+K to preview the HTML file).