library('tm')
## Loading required package: NLP
library('RColorBrewer')
library('wordcloud')
# Load the full tweet export. The path is machine-specific and must exist
# for the rest of the script to run.
trump <- read.csv(
  file = "C:/Users/Bacchus/Desktop/2018 spring/special topic for CS/week 3/Trump.csv"
)

# Washington: keep the rows whose USER_CITY is exactly "WASHINGTON", write
# them to an RDS file in the working directory, read that file back, and
# pull out the tweet text column.
washington <- subset(trump, USER_CITY == "WASHINGTON")
saveRDS(object = washington, file = "washington.RDS")
washington <- readRDS(file = "washington.RDS")
washington_tweets <- washington$MESSAGE_BODY

# New York City: same steps for rows whose USER_CITY is exactly
# "New York City".
nyc <- subset(trump, USER_CITY == "New York City")
saveRDS(object = nyc, file = "nyc.RDS")
nyc <- readRDS(file = "nyc.RDS")
nyc_tweets <- nyc$MESSAGE_BODY
# Normalise raw tweet text so it can be tokenised into words.
#
# Steps, in order: lower-case, drop links, drop the retweet marker "rt",
# drop @mentions, strip punctuation, strip digits, collapse whitespace and
# trim both ends.
#
# Args:
#   x: character vector of tweets (anything tolower() can coerce to
#      character, e.g. a factor column from read.csv, also works).
#
# Returns:
#   A character vector of the same length as `x` containing the cleaned text.
clean.text <- function(x) {
  x <- tolower(x)

  # Links go first, while "://", "." and "/" are still present, so the whole
  # URL is removed as one token instead of relying on punctuation stripping
  # to glue it together.
  x <- gsub("http\\S+", " ", x)

  # Remove "rt" only when it is a stand-alone word. A bare "rt" pattern also
  # eats the letters inside ordinary words ("party" -> "pay").
  x <- gsub("\\brt\\b", " ", x)

  # Remove @mentions.
  x <- gsub("@\\w+", " ", x)

  # Remove punctuation and digits.
  x <- gsub("[[:punct:]]", "", x)
  x <- gsub("[[:digit:]]", "", x)

  # Collapse every run of whitespace to ONE space. Replacing the run with ""
  # would concatenate the neighbouring words ("election  trump" ->
  # "electiontrump").
  x <- gsub("[[:space:]]+", " ", x)

  # Strip all leading and trailing whitespace, not just a single space.
  trimws(x)
}
# Clean the tweet text of both cities before any tokenising.
washington_tweets <- clean.text(washington_tweets)
nyc_tweets <- clean.text(nyc_tweets)

# ---- Washington: term frequencies and word cloud ----

# Build a tm corpus from the cleaned Washington tweets.
corpus <- Corpus(VectorSource(washington_tweets))

# Term-document matrix; the control list restricts word length to 3-20,
# removes punctuation, numbers and English stop words, and lower-cases.
tdm <- TermDocumentMatrix(
  corpus,
  control = list(
    wordLengths = c(3, 20),
    removePunctuation = TRUE,
    stopwords = c("the", "a", stopwords("english")),
    removeNumbers = TRUE,
    tolower = TRUE
  )
)

# Convert to a plain matrix so rowSums() gives the total count per term.
tdm <- as.matrix(tdm)
word_freqs <- sort(rowSums(tdm), decreasing = TRUE)
dm <- data.frame(word = names(word_freqs), freq = word_freqs)

# Plot the 50 most frequent terms; random.order = FALSE is passed so the
# layout is not randomised.
wordcloud(
  head(dm$word, 50),
  head(dm$freq, 50),
  random.order = FALSE,
  colors = brewer.pal(8, "Dark2")
)

# Show the 20 most frequent Washington terms.
head(word_freqs, 20)
## election trump gop debate primary just
## 48 13 10 6 3 3
## gopdebate lead policy amp point republican
## 3 3 3 3 3 3
## answer can new theyre michigan win
## 2 2 2 2 2 2
## beat amiright
## 2 2
# ---- New York City: term frequencies and word cloud ----
# Note: corpus, tdm, word_freqs and dm are reassigned here, so from this
# point on they hold the NYC results.

# Build a tm corpus from the cleaned NYC tweets.
corpus <- Corpus(VectorSource(nyc_tweets))

# Term-document matrix; the control list restricts word length to 3-20,
# removes punctuation, numbers and English stop words, and lower-cases.
tdm <- TermDocumentMatrix(
  corpus,
  control = list(
    wordLengths = c(3, 20),
    removePunctuation = TRUE,
    stopwords = c("the", "a", stopwords("english")),
    removeNumbers = TRUE,
    tolower = TRUE
  )
)

# Convert to a plain matrix so rowSums() gives the total count per term.
tdm <- as.matrix(tdm)
word_freqs <- sort(rowSums(tdm), decreasing = TRUE)
dm <- data.frame(word = names(word_freqs), freq = word_freqs)

# Plot the 50 most frequent terms; random.order = FALSE is passed so the
# layout is not randomised.
wordcloud(
  head(dm$word, 50),
  head(dm$freq, 50),
  random.order = FALSE,
  colors = brewer.pal(8, "Dark2")
)

# Show the 20 most frequent NYC terms.
head(word_freqs, 20)
## trump election republicans
## 49 30 23
## electiontrumptrain new vote
## 22 22 22
## will video one
## 3 3 3
## think gop againtrump
## 3 2 2
## politicianshe smas tcot
## 2 2 2
## trumped racistmania donaldtrump
## 2 2 2
## skills cruzâs
## 2 2