---
title: "week3_homework"
author: "Xuyang Wu"
date: "February 7, 2018"
output: html_document
---
## [1] "Using direct authentication"
# Query Twitter for up to 1000 tweets matching "LIUBrooklyn", passing
# `since = "2016-11-08"` and allowing one retry when rate-limited.
tw <- twitteR::searchTwitter(
  "LIUBrooklyn",
  n = 1000,
  since = "2016-11-08",
  retryOnRateLimit = 1
)
## Warning in doRppAPICall("search/tweets", n, params = params,
## retryOnRateLimit = retryOnRateLimit, : 1000 tweets were requested but the
## API can only return 170

# Convert the returned list of tweets into a data frame.
liuBK <- twitteR::twListToDF(tw)

# Packages used further down for text mining and plotting.
library("tm")
## Loading required package: NLP
library("RColorBrewer")
library("wordcloud")

# Write the data frame to disk, then reload it from that same file so the
# rest of the script works from the saved copy.
saveRDS(liuBK, file = "liuBK.RDS")
liuBK <- readRDS(file = "liuBK.RDS")

# Keep only the tweet text for the cleaning step.
liuBKtweets <- liuBK$text
# Normalise raw tweet text before tokenisation.
#
# Args:
#   x: character vector of tweets (may be empty).
#
# Returns:
#   A character vector the same length as `x`, lower-cased, with the
#   retweet marker, @mentions, punctuation, digits and URLs removed, runs of
#   whitespace collapsed to a single space, and leading/trailing whitespace
#   trimmed.
clean.text <- function(x) {
  # Replace every non-printable / whitespace character with a plain space so
  # that only ordinary spaces remain as separators.
  x <- gsub("[^[:graph:]]", " ", x)
  x <- tolower(x)
  # Remove only the standalone retweet marker; an unanchored "rt" would also
  # eat the letters inside ordinary words (e.g. "party" -> "pay").
  x <- gsub("\\brt\\b", "", x)
  x <- gsub("@\\w+", "", x)
  x <- gsub("[[:punct:]]", "", x)
  x <- gsub("[[:digit:]]", "", x)
  # URLs have already lost their punctuation and digits at this point, so
  # each one is a single "http..." token.
  x <- gsub("http\\w+", "", x)
  # Collapse runs of whitespace to ONE space. Replacing them with "" would
  # glue the neighbouring words together.
  x <- gsub("[ \t]+", " ", x)
  trimws(x)
}
# Clean the tweet text and wrap it in a tm corpus.
liuBKtweets <- clean.text(liuBKtweets)
corpus <- Corpus(VectorSource(liuBKtweets))

# Build the term-document matrix with the tm control options below
# (word length bounds 3 and 20, punctuation/number removal, lower-casing,
# and English stop words plus "the" and "a").
tdm <- TermDocumentMatrix(
  corpus,
  control = list(
    wordLengths = c(3, 20),
    removePunctuation = TRUE,
    stopwords = c("the", "a", stopwords("english")),
    removeNumbers = TRUE,
    tolower = TRUE
  )
)
tdm <- as.matrix(tdm)

# Total count per term across all documents, most frequent first.
word_freqs <- sort(rowSums(tdm), decreasing = TRUE)

# Discard the nine most frequent terms.
# NOTE(review): the reason for dropping exactly nine is not visible here;
# presumably they are the search term and similar noise -- confirm.
word_freqs <- word_freqs[-seq_len(9)]

dm <- data.frame(word = names(word_freqs), freq = word_freqs)

# Plot the first 50 remaining terms, placed in decreasing frequency order.
wordcloud(
  head(dm$word, 50),
  head(dm$freq, 50),
  scale = c(2, 0.9),
  random.order = FALSE,
  colors = brewer.pal(8, "Dark2")
)

# Print the 20 most frequent remaining terms.
head(word_freqs, 20)
## liubrooklyn blackbirdnation big
## 9 9 8
## day game francis
## 7 7 6
## liu onepack saint
## 5 5 5
## usteinberg wracvs committing
## 5 5 4
## congratulationson dewittlong forsarah
## 4 4 4
## island play confidentialpodcast
## 4 4 4
## episode found
## 4 4