This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button, a document will be generated that includes both the content and the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.
Collect tweets from Twitter API
#install.packages("ROAuth")
#install.packages("twitteR")
library("ROAuth")
library("twitteR")
# install.packages('tm')
# install.packages('RColorBrewer')
# install.packages('wordcloud')
library('tm')
## Loading required package: NLP
library('RColorBrewer')
library('wordcloud')
#*****************************
# Create your own Twitter key
# https://developer.twitter.com/en/docs/basics/getting-started#get-started-app
## [1] "Using direct authentication"
## Warning in doRppAPICall("search/tweets", n, params = params,
## retryOnRateLimit = retryOnRateLimit, : 1000 tweets were requested but the
## API can only return 144
# Normalise raw tweet text before tokenisation.
#
# Args:
#   x: character vector of tweets (one element per tweet).
#
# Returns:
#   A character vector of the same length as `x`, lower-cased, with links,
#   @mentions, the retweet marker "rt", punctuation and digits removed, and
#   whitespace collapsed to single spaces with no leading/trailing blanks.
clean.text <- function(x) {
  x <- tolower(x)
  # Remove links first, while the URL is still one whitespace-delimited token;
  # stripping punctuation beforehand would rely on the pieces fusing together.
  x <- gsub("http[^[:space:]]+", "", x)
  # Remove @mentions
  x <- gsub("@\\w+", "", x)
  # Remove the retweet marker only as a standalone word, so that words which
  # merely contain "rt" (e.g. "start", "party") are left intact.
  x <- gsub("\\brt\\b", "", x)
  # Remove punctuation
  x <- gsub("[[:punct:]]", "", x)
  # Remove numbers
  x <- gsub("[[:digit:]]", "", x)
  # Collapse runs of spaces/tabs into ONE space; deleting them outright would
  # glue neighbouring words together.
  x <- gsub("[[:space:]]+", " ", x)
  # Trim any remaining blanks at either end
  x <- gsub("^ +| +$", "", x)
  x
}
# Replace the raw tweet text with its cleaned form
liuBK <- clean.text(liuBK)
Create word cloud of tweets
# Wrap the cleaned tweets in a tm corpus (one document per vector element)
corpus <- Corpus(VectorSource(liuBK))
# Alternative input kept for reference:
# corpus = Corpus(VectorSource(cmail))

# Build the term-document matrix and densify it in one step.
# NOTE: the order of entries in `control` is kept exactly as written.
tdm <- as.matrix(
  TermDocumentMatrix(
    corpus,
    control = list(
      wordLengths = c(3, 20),
      removePunctuation = TRUE,
      stopwords = c("the", "a", stopwords("english")),
      removeNumbers = TRUE,
      tolower = TRUE
    )
  )
)

# Total occurrences per term, most frequent first, then drop the nine
# highest-ranked terms (positions 1 to 9), which were judged not to carry
# insight.
word_freqs <- sort(rowSums(tdm), decreasing = TRUE)[-seq_len(9)]

# Two-column table: each remaining term and its frequency
dm <- data.frame(word = names(word_freqs), freq = word_freqs)

# Draw a coloured word cloud of the first 100 rows (needs RColorBrewer)
wordcloud(
  head(dm[["word"]], 100),
  head(dm[["freq"]], 100),
  random.order = FALSE,
  colors = brewer.pal(8, "Dark2")
)

# Print the 20 most-mentioned remaining terms
head(word_freqs, 20)
## stud eduaubdedubuamark brooklyn
## 91 86 14
## clienta web center
## 12 12 10
## neduaubdedubud liubrooklyn androida
## 10 9 9
## big day steinberg
## 8 7 7
## francis game neduaubdedububa
## 6 6 5
## nufufefpm onepack saint
## 5 5 5
## wracnneduaubcedubfu litea
## 5 5