library('tm')
## Loading required package: NLP
library('RColorBrewer')
library('wordcloud')
# Read the complete tweet export from disk.
trump <- read.csv(
  "C:/Users/Bacchus/Desktop/2018 spring/special topic for CS/week 3/Trump.csv"
)

# Washington: filter by USER_CITY, write the subset to an RDS file, read it
# back from that file, then pull out the message text column.
washington <- subset(trump, USER_CITY == "WASHINGTON")
saveRDS(washington, "washington.RDS")
washington <- readRDS("washington.RDS")
washington_tweets <- washington$MESSAGE_BODY

# New York City: identical steps with its own city label and RDS file.
nyc <- subset(trump, USER_CITY == "New York City")
saveRDS(nyc, "nyc.RDS")
nyc <- readRDS("nyc.RDS")
nyc_tweets <- nyc$MESSAGE_BODY
# Normalise raw tweet text before it is turned into a corpus.
#
# Args:
#   x: vector of tweet texts (tolower() coerces it to character).
# Returns:
#   A character vector of the same length as x: lower-cased, with links,
#   the standalone "rt" retweet marker, @mentions, punctuation and digits
#   removed, every run of whitespace collapsed to one space, and
#   leading/trailing whitespace trimmed.
clean.text <- function(x) {
  x <- tolower(x)
  # Strip links first, while "://", "." and "/" still hold the URL together
  # as one whitespace-delimited token.
  x <- gsub("http\\S+", "", x)
  # Drop "rt" only when it is a whole word; a bare "rt" pattern would also
  # eat the letters inside words such as "party" or "smart".
  x <- gsub("\\brt\\b", "", x)
  # Remove @mentions.
  x <- gsub("@\\w+", "", x)
  # Remove punctuation and digits.
  x <- gsub("[[:punct:]]", "", x)
  x <- gsub("[[:digit:]]", "", x)
  # Collapse whitespace runs to ONE space; replacing them with "" would fuse
  # the neighbouring words into a single bogus term.
  x <- gsub("[[:space:]]+", " ", x)
  # Trim whatever whitespace is left at either end.
  trimws(x)
}
# Clean the tweet text of both cities before any corpus is built.
washington_tweets <- clean.text(washington_tweets)
nyc_tweets <- clean.text(nyc_tweets)

# ---- Washington word cloud ----
# Build a corpus from the cleaned Washington tweets.
corpus <- Corpus(VectorSource(washington_tweets))

# Term-document matrix restricted to words of 3-20 characters, with
# punctuation, numbers and stop words removed and text lower-cased.
# NOTE(review): the option order is kept exactly as written; tm's termFreq
# docs describe some of these options as order-sensitive.
tdm <- TermDocumentMatrix(
  corpus,
  control = list(
    wordLengths = c(3, 20),
    removePunctuation = TRUE,
    stopwords = c("the", "a", stopwords("english")),
    removeNumbers = TRUE,
    tolower = TRUE
  )
)

# Convert to an ordinary matrix, then total each term's row and rank the
# terms from most to least frequent.
tdm <- as.matrix(tdm)
word_freqs <- sort(rowSums(tdm), decreasing = TRUE)

# Word/frequency table used for plotting.
dm <- data.frame(word = names(word_freqs), freq = word_freqs)

# Plot the 50 highest-ranked terms.
wordcloud(
  words = head(dm$word, 50),
  freq = head(dm$freq, 50),
  random.order = FALSE,
  colors = brewer.pal(8, "Dark2")
)

# Print the 20 highest-ranked terms with their counts.
head(word_freqs, 20)
##   election      trump        gop     debate    primary       just 
##         48         13         10          6          3          3 
##  gopdebate       lead     policy        amp      point republican 
##          3          3          3          3          3          3 
##     answer        can        new     theyre   michigan        win 
##          2          2          2          2          2          2 
##       beat   amiright 
##          2          2
# ---- New York City word cloud ----
# Build a corpus from the New York City tweet text.
corpus <- Corpus(VectorSource(nyc_tweets))

# Term-document matrix restricted to words of 3-20 characters, with
# punctuation, numbers and stop words removed and text lower-cased.
# NOTE(review): the option order is kept exactly as written; tm's termFreq
# docs describe some of these options as order-sensitive.
tdm <- TermDocumentMatrix(
  corpus,
  control = list(
    wordLengths = c(3, 20),
    removePunctuation = TRUE,
    stopwords = c("the", "a", stopwords("english")),
    removeNumbers = TRUE,
    tolower = TRUE
  )
)

# Convert to an ordinary matrix, then total each term's row and rank the
# terms from most to least frequent.
tdm <- as.matrix(tdm)
word_freqs <- sort(rowSums(tdm), decreasing = TRUE)

# Word/frequency table used for plotting.
dm <- data.frame(word = names(word_freqs), freq = word_freqs)

# Plot the 50 highest-ranked terms.
wordcloud(
  words = head(dm$word, 50),
  freq = head(dm$freq, 50),
  random.order = FALSE,
  colors = brewer.pal(8, "Dark2")
)

# Print the 20 highest-ranked terms with their counts.
head(word_freqs, 20)
##              trump           election        republicans 
##                 49                 30                 23 
## electiontrumptrain                new               vote 
##                 22                 22                 22 
##               will              video                one 
##                  3                  3                  3 
##              think                gop         againtrump 
##                  3                  2                  2 
##      politicianshe               smas               tcot 
##                  2                  2                  2 
##            trumped        racistmania        donaldtrump 
##                  2                  2                  2 
##             skills           cruz’s 
##                  2                  2