#' ---
#' title: "week3_homework"
#' author: "Xuyang Wu"
#' date: "February 7, 2018"
#' output: html_document
#' ---

## [1] "Using direct authentication"
# Search Twitter for "LIUBrooklyn", requesting up to 1000 tweets dated from
# 2016-11-08 onwards, with one retry allowed if the rate limit is hit.
tw <- twitteR::searchTwitter(
  "LIUBrooklyn",
  n = 1000,
  since = "2016-11-08",
  retryOnRateLimit = 1
)
## Warning in doRppAPICall("search/tweets", n, params = params,
## retryOnRateLimit = retryOnRateLimit, : 1000 tweets were requested but the
## API can only return 170

# Convert the returned tweet list into a data frame.
liuBK <- twitteR::twListToDF(tw)
library('tm')
## Loading required package: NLP
library('RColorBrewer')
library('wordcloud')
# Write the tweet data frame to "liuBK.RDS" and read it straight back, so the
# remaining steps work from the copy stored on disk.
saveRDS(liuBK, file = "liuBK.RDS")
liuBK <- readRDS(file = "liuBK.RDS")

# Keep only the tweet text for the text-mining steps.
liuBKtweets <- liuBK$text
# Normalise raw tweet text ahead of tokenisation.
#
# Args:
#   x: character vector of tweet texts (may be empty).
#
# Returns:
#   A character vector of the same length as `x`, lower-cased, with URLs,
#   @mentions, the standalone retweet marker "rt", punctuation and digits
#   removed, runs of whitespace collapsed to one space, and leading/trailing
#   whitespace trimmed.
clean.text <- function(x) {
  # Anything that is not a visible character (control characters, tabs,
  # newlines, ...) becomes a plain space.
  x <- gsub("[^[:graph:]]", " ", x)
  x <- tolower(x)

  # Drop URLs and @mentions while their punctuation is still intact, so the
  # patterns can anchor on "http" and "@".
  x <- gsub("http\\S+", "", x)
  x <- gsub("@\\w+", "", x)

  # Remove the retweet marker only when it is a word of its own; an unanchored
  # "rt" would also eat the middle of words such as "party" or "start".
  x <- gsub("\\brt\\b", "", x)

  x <- gsub("[[:punct:]]", "", x)
  x <- gsub("[[:digit:]]", "", x)

  # Collapse whitespace runs to ONE space. Replacing them with "" would glue
  # the neighbouring words together (e.g. "congratulations  on" ->
  # "congratulationson").
  x <- gsub("[ \t]{2,}", " ", x)

  # Strip all leading/trailing whitespace, not just a single space.
  trimws(x)
}
# Clean the raw tweet text and wrap the result in a tm corpus.
liuBKtweets <- clean.text(liuBKtweets)
corpus <- Corpus(VectorSource(liuBKtweets))

# Build the term-document matrix: keep terms of 3 to 20 characters, strip
# punctuation and numbers, lower-case, and drop English stop words. The
# control entries are deliberately left in their original order.
tdm <- TermDocumentMatrix(
  corpus,
  control = list(
    wordLengths = c(3, 20),
    removePunctuation = TRUE,
    stopwords = c("the", "a", stopwords("english")),
    removeNumbers = TRUE,
    tolower = TRUE
  )
)
tdm <- as.matrix(tdm)

# Total count of each term across all tweets, most frequent first.
word_freqs <- sort(rowSums(tdm), decreasing = TRUE)

# Discard the nine most frequent terms.
# NOTE(review): the reason for dropping exactly nine is not stated - confirm.
word_freqs <- word_freqs[-seq_len(9)]

dm <- data.frame(word = names(word_freqs), freq = word_freqs)

# Word cloud of the 50 most frequent remaining terms, largest words placed
# first (random.order = FALSE), coloured with the 8-colour "Dark2" palette.
wordcloud(
  head(dm$word, 50),
  head(dm$freq, 50),
  scale = c(2, 0.9),
  random.order = FALSE,
  colors = brewer.pal(8, "Dark2")
)

# Print the 20 most frequent remaining terms with their counts.
head(word_freqs, 20)
##         liubrooklyn     blackbirdnation                 big 
##                   9                   9                   8 
##                 day                game             francis 
##                   7                   7                   6 
##                 liu             onepack               saint 
##                   5                   5                   5 
##          usteinberg              wracvs          committing 
##                   5                   5                   4 
##   congratulationson          dewittlong            forsarah 
##                   4                   4                   4 
##              island                play confidentialpodcast 
##                   4                   4                   4 
##             episode               found 
##                   4                   4