Trump Word Cloud

Initial Setup

Load Libraries

First, load the libraries that will be used in this exercise.

library(twitteR)
library(wordcloud)
library(tm)
library(XML)
library(RColorBrewer)
library(dplyr)

Connect to the Twitter API

Then use the twitteR package to send your credentials and create a session.
Note: my personal credentials have been hidden here; you must enter your own.
To get access to the Twitter API you must sign up through Twitter's developer program. If you haven't done so already, please do so HERE.

consumer_key <- "ENTER-CONSUMER-KEY"
consumer_secret <- "ENTER-CONSUMER-SECRET"
access_token <- "ENTER-ACCESS-TOKEN"
access_secret <- "ENTER-ACCESS-SECRET"
# set up OAuth between this script and your Twitter app
setup_twitter_oauth(consumer_key, consumer_secret, access_token, access_secret)

## [1] "Using direct authentication"

Most Recent 100 Tweets Word Cloud

Let's take a look at what we receive by setting parameters to retrieve the most recent 100 tweets.

# create a twitteR user object for Trump by searching Twitter for @realDonaldTrump
## Note: the '@' symbol is dropped when passing the handle to getUser()
D.User <- getUser('realDonaldTrump')


# exclude replies so we only have his words
Don100 <- userTimeline(D.User, n = 100, excludeReplies = TRUE)
# create data frame from twitteR list
t100.frame <- twListToDF(Don100)

# create a vector of words from the column of sentences,
# starting with the most recent 100 tweets
l100 <- strsplit(t100.frame$text, " ")
d100 <- data.frame(Words = unlist(l100))
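As a quick sanity check (illustrative; your tweets will differ), peek at the first few rows of d100:

# each row of d100 is a single whitespace-delimited token from a tweet
head(d100$Words, 10)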

# use tm package to convert our column of words into a corpus of text
tCorpus = Corpus(VectorSource(d100$Words))
# tm_map lets us remove punctuation and numbers,
# make sure lower- and upper-case words are counted the same,
# and, perhaps most significantly, remove stop words like the, and, but, etc.;
# otherwise our word cloud would be full of these words,
# given how often they appear in language
tCorpus = tm_map(tCorpus, content_transformer(tolower))
tCorpus = tm_map(tCorpus, removePunctuation)
tCorpus = tm_map(tCorpus, removeNumbers)
tCorpus = tm_map(tCorpus, removeWords, c(stopwords("SMART")))

myDTM = TermDocumentMatrix(tCorpus,
                           control = list(minWordLength = 1))
m <- as.matrix(myDTM)
v <- sort(rowSums(m), decreasing = TRUE)
wordcloud(names(v), v, scale = c(3,0.75), min.freq = 1,
              colors = brewer.pal(7, "Dark2"))
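To see which terms dominate the cloud, we can also peek at the sorted frequency vector v:

# v is a named vector of term counts, sorted in decreasing order
head(v, 10)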

The complete collection of stopwords in the SMART list can be seen at the SMART information retrieval system. The SMART list is a reliable and fairly standard set of stopwords and is also used by the MC toolkit.
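The SMART list ships with the tm package, so you can inspect it locally as well:

# how many SMART stopwords are there, and what do they look like?
length(stopwords("SMART"))
head(stopwords("SMART"), 20)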

Using a table to make a word cloud

We can also create a word cloud from a frequency table, as shown here, using the most recent 100 tweets data frame, d100.

# create a table from the d100 data frame, grouping by unique words,
# adding a new column with the tally of each,
# and sorting the table by the number of occurrences of each unique word
t100 <- d100 %>% group_by(Words) %>% tally(sort = TRUE)
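Before plotting, a quick look at the head of t100 confirms the tally (a sketch; the actual words depend on whatever tweets you pulled):

# the most frequent words sort to the top
head(t100, 10)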

wordcloud(words = t100$Words, freq = t100$n, scale = c(3,0.5),
          random.order = FALSE, min.freq = 2,
          colors = brewer.pal(6, "Reds"))

Pulling Max Trump Tweets

The twitteR package's userTimeline() function returns a maximum of 3,200 tweets. These can be 3,200 tweets from just the user passed in, or, if the excludeReplies argument is left at its default value of FALSE, a combined maximum of 3,200 of the user's most recent tweets plus replies to those tweets.
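A minimal sketch of the difference (counts will vary by account and date; withReplies and tweetsOnly are names I've made up here):

# default excludeReplies = FALSE: tweets and replies count toward the cap
withReplies <- userTimeline(D.User, n = 3200)
# excludeReplies = TRUE: only the user's own standalone tweets
tweetsOnly <- userTimeline(D.User, n = 3200, excludeReplies = TRUE)
length(withReplies)
length(tweetsOnly)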

NOTE: While I requested 3,200 tweets, I only received 718 on Feb 8, 2017. It appears to be common for Trump to delete his old tweets nearly every day. Extra NOTE: You can pull tweets that have been deleted, which means Twitter keeps an accessible record of them. Just an FYI, since I've pulled deleted tweets from Trump's profile before.

If the built-in stopword removal doesn't seem to be working, the stop words can be removed manually instead; see the Manual Stopwords section below.

# grab the max (n = 3200) number of Trump's tweets
Don3200 <- userTimeline(D.User, n = 3200, excludeReplies = TRUE)

# create an R data frame from our twitteR list
t32k.frame <- twListToDF(Don3200)
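This is where the shortfall mentioned above shows up; you can check how many tweets actually came back, and from what date range, since twListToDF includes a created timestamp column:

# how many tweets were actually returned, and from when?
nrow(t32k.frame)
range(t32k.frame$created)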

l32 <- strsplit(t32k.frame$text, " ")
d32 <- data.frame(Words = unlist(l32))

t32Corpus = Corpus(VectorSource(d32$Words))
t32Corpus = tm_map(t32Corpus, content_transformer(tolower))
t32Corpus = tm_map(t32Corpus, removePunctuation)
t32Corpus = tm_map(t32Corpus, removeNumbers)
t32Corpus = tm_map(t32Corpus, removeWords, c(stopwords("SMART")))

# scrape 1000 most recent tweets
Don1k <- userTimeline(D.User, n = 1000, excludeReplies = TRUE)
# create R data frame from twitteR list object
t1k.frame <- twListToDF(Don1k)

l1 <- strsplit(t1k.frame$text, " ")
d1 <- data.frame(Words = unlist(l1))

t1Corpus = Corpus(VectorSource(d1$Words))
t1Corpus = tm_map(t1Corpus, content_transformer(tolower))
t1Corpus = tm_map(t1Corpus, removePunctuation)
t1Corpus = tm_map(t1Corpus, removeNumbers)
t1Corpus = tm_map(t1Corpus, removeWords, c(stopwords("SMART")))
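The 1,000-tweet corpus can be fed through the same DTM-to-wordcloud pipeline as the 100-tweet set (a sketch mirroring the earlier chunk; my1kDTM, m1, and v1 are names I've introduced here):

my1kDTM <- TermDocumentMatrix(t1Corpus,
                              control = list(minWordLength = 1))
m1 <- as.matrix(my1kDTM)
v1 <- sort(rowSums(m1), decreasing = TRUE)
wordcloud(names(v1), v1, scale = c(2, 0.25), min.freq = 5,
          colors = brewer.pal(8, "Dark2"))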

Manual Stopwords

# Manual removal of stopwords:
# read the SMART list straight from the web (base R, no extra packages needed)
stopWords <- read.table(url("http://jmlr.csail.mit.edu/papers/volume5/lewis04a/a11-smart-stop-list/english.stop"))
# read.table returns a one-column data frame; extract it as a character vector
vstop <- as.character(stopWords$V1)
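As a sanity check (illustrative), the downloaded list should line up with the copy tm ships:

# both should be plain character vectors of the same stopwords,
# so the set difference should be empty or nearly so
length(vstop)
length(stopwords("SMART"))
setdiff(vstop, stopwords("SMART"))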

# faster manual method: use the copy of the SMART list that ships with tm
stpWrd <- stopwords("SMART")

lu100 <- unlist(l100)[!(unlist(l100) %in% stpWrd)]
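One caveat: %in% is case-sensitive, so capitalized stopwords like "The" survive this filter (the tm_map calls below still catch them after lower-casing). To handle it in one pass, lowercase before filtering; lu100_lc below is a hypothetical name for that variant:

# lowercase first so "The" and "the" are both dropped (sketch)
lu100_lc <- tolower(unlist(l100))
lu100_lc <- lu100_lc[!(lu100_lc %in% stpWrd)]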

tCorpus = Corpus(VectorSource(lu100))

tCorpus = tm_map(tCorpus, content_transformer(tolower))
tCorpus = tm_map(tCorpus, removePunctuation)
tCorpus = tm_map(tCorpus, removeNumbers)
tCorpus = tm_map(tCorpus, removeWords, c(stopwords("SMART")))

myDTM = TermDocumentMatrix(tCorpus,
                           control = list(minWordLength = 1))
m <- as.matrix(myDTM)
v <- sort(rowSums(m), decreasing = TRUE)
wordcloud(names(v), v, scale = c(2,0.25), min.freq = 2,
              colors = brewer.pal(5, "Dark2"))
# reuse the objects created above: the user D.User, the 3200-tweet pull
# Don3200 and its data frame t32k.frame, the word list l32, and the
# SMART stopword vector stpWrd

lu32 <- unlist(l32)[!(unlist(l32) %in% stpWrd)]

t32Corpus = Corpus(VectorSource(lu32))

# tolower was running slowly on this larger corpus, so skip it here
# (TermDocumentMatrix lower-cases terms by default anyway)
#t32Corpus = tm_map(t32Corpus, content_transformer(tolower))
t32Corpus = tm_map(t32Corpus, removePunctuation)
t32Corpus = tm_map(t32Corpus, removeNumbers)
# stop words were already removed manually above
#t32Corpus = tm_map(t32Corpus, removeWords, c(stopwords("SMART")))

myDTM = TermDocumentMatrix(t32Corpus,
                           control = list(minWordLength = 1))
m <- as.matrix(myDTM)
v <- sort(rowSums(m), decreasing = TRUE)
wordcloud(names(v), v, scale = c(2,0.25), min.freq = 20,
              colors = brewer.pal(8, "Dark2"))
Finally, removeWords also accepts custom words appended to the SMART list, which is handy for stripping Twitter-specific noise. (The custom words below, like "amp" from stray &amp; entities, are illustrative; swap in whatever clutter shows up in your own pull. The corpus is built with VectorSource here, matching the earlier chunks.)

myCorpus = Corpus(VectorSource(d100$Words))
myCorpus = tm_map(myCorpus, content_transformer(tolower))
myCorpus = tm_map(myCorpus, removePunctuation)
myCorpus = tm_map(myCorpus, removeNumbers)
# append illustrative custom words to the SMART list
myCorpus = tm_map(myCorpus, removeWords,
                  c(stopwords("SMART"),
                    "amp", "rt", "via"))

# min.freq kept low since only 100 tweets feed this corpus
wordcloud(myCorpus, scale = c(3,0.5),
          random.order = FALSE, min.freq = 3,
          colors = brewer.pal(8, "Dark2"))