Extracting Tweets

Retrieve tweets from Twitter

# Load packages
library(rtweet)
# Twitter authentication
create_token(
  app             = "my_twitter_research_app",
  consumer_key    = consumer_key,
  consumer_secret = consumer_secret,
  access_token    = access_token,
  access_secret   = access_secret)
<Token>
<oauth_endpoint>
 request:   https://api.twitter.com/oauth/request_token
 authorize: https://api.twitter.com/oauth/authenticate
 access:    https://api.twitter.com/oauth/access_token
<oauth_app> my_twitter_research_app
  key:    koZFJbYVk7gagYEGLaUN6RmTb
  secret: <hidden>
<credentials> oauth_token, oauth_token_secret
---
# Retrieve tweets
tweets <- search_tweets("#MachineLearning", n = 10000, langs="en", tweet_mode="extended")
Searching for tweets...
Finished collecting tweets!

Tweets Description

## plot time series of tweets
ts_plot(tweets, "3 hours") +
  ggplot2::theme_minimal() +
  ggplot2::theme(plot.title = ggplot2::element_text(face = "bold")) +
  ggplot2::labs(
    x = NULL, y = NULL,
    title = "Frequency of #MachineLearning Twitter statuses from past 9 days",
    subtitle = "Twitter status (tweet) counts aggregated using three-hour intervals",
    caption = "\nSource: Data collected from Twitter's REST API via rtweet"
  )

head(tweets)

Text Cleaning

library(tm)

Build corpus

# build a corpus, and specify the source to be character vectors 
myCorpus <- Corpus(VectorSource(tweets$text))
# convert to lower case
myCorpus <- tm_map(myCorpus, content_transformer(tolower))
transformation drops documents
# remove URLs
removeURL <- function(x) gsub("http[^[:space:]]*", "", x)
myCorpus <- tm_map(myCorpus, content_transformer(removeURL))
transformation drops documents
# remove anything other than English letters or space 
removeNumPunct <- function(x) gsub("[^[:alpha:][:space:]]*", "", x) 
myCorpus <- tm_map(myCorpus, content_transformer(removeNumPunct))
transformation drops documents
# remove stopwords
myStopwords <- c(setdiff(stopwords('english'), c("r", "big")), "use", "see", "used", "via", "amp")
myCorpus <- tm_map(myCorpus, removeWords, myStopwords)
transformation drops documents
# remove extra whitespace
myCorpus <- tm_map(myCorpus, stripWhitespace)
transformation drops documents
# keep a copy for stem completion later
myCorpusCopy <- myCorpus

Frequent Words

Build Term Document Matrix

tdm <- TermDocumentMatrix(myCorpus, control = list(wordLengths = c(1, Inf)))
tdm
<<TermDocumentMatrix (terms: 8532, documents: 9737)>>
Non-/sparse entries: 175088/82900996
Sparsity           : 100%
Maximal term length: 36
Weighting          : term frequency (tf)

Top Frequent Terms

freq.terms <- findFreqTerms(tdm, lowfreq = 20)
freq.terms[1:50]
 [1] "ai"                     "algorithms"             "bigdata"                "datascience"            "deeplearning"          
 [6] "iot"                    "learning"               "machine"                "machinelearning"        "ml"                    
[11] "nlp"                    "read"                   "robots"                 "artificial"             "artificialintelligence"
[16] "causes"                 "conflict"               "decodes"                "intelligence"           "religious"             
[21] "system"                 "data"                   "global"                 "re"                     "reduce"                
[26] "risk"                   "boost"                  "million"                "science"                "approach"              
[31] "fear"                   "holds"                  "hope"                   "instead"                "promise"               
[36] "vast"                   "analytics"              "clinicalanalytics"      "decrease"               "healthtech"            
[41] "hospital"               "improve"                "increase"               "phuse"                  "physical"              
[46] "physicaltherapy"        "predictiveanalytics"    "readmissions"           "referrals"              "therapists"            
term.freq <- rowSums(as.matrix(tdm))
term.freq <- subset(term.freq, term.freq >= 1000)
df <- data.frame(term = names(term.freq), freq = term.freq)
ggplot2::ggplot(df, aes(x=term, y=freq)) + geom_bar(stat="identity") +
  xlab("Terms") + ylab("Count") + coord_flip() +
  theme(axis.text=element_text(size=7))

Wordcloud

Build Wordcloud

library(wordcloud)
m <- as.matrix(tdm)
# calculate the frequency of words and sort it by frequency 
word.freq <- sort(rowSums(m), decreasing = T)
# colors
pal <- brewer.pal(9, "BuGn")[-(1:4)]
wordcloud(words = names(word.freq), freq = word.freq, min.freq = 300,
    random.order = F, colors = pal)

LS0tCnRpdGxlOiAiVHdpdHRlciBBbmFseXNpcyIKb3V0cHV0OgogIGh0bWxfbm90ZWJvb2s6CiAgICB0b2M6IHllcwogICAgdG9jX2Zsb2F0OiB0cnVlCi0tLQoKIyMgRXh0cmFjdGluZyBUd2VldHMKCiMjIyBSZXRyaWV2ZSB0d2VldHMgZnJvbSBUd2l0dGVyCgpgYGB7cn0KIyBMb2FkIHBhY2thZ2VzCmxpYnJhcnkocnR3ZWV0KQpgYGAKCmBgYHtyIGluY2x1ZGU9RkFMU0V9CiMgQWNjZXNzIHRva2VuIGFuZCBBUElzCmNvbnN1bWVyX2tleSAgICA8LSAia29aRkpiWVZrN2dhZ1lFR0xhVU42Um1UYiIKY29uc3VtZXJfc2VjcmV0IDwtICJUWDRiaWNGWXVLQkRrZkloTkNTS1RwYTlySlpDVWRMQmMwYjRzeXJtUGw4MUZubTdhaiIKYWNjZXNzX3Rva2VuICAgIDwtICI2Nzc3MzQwNi10U2RSZ0xlS2R6V2J6ZFVLajlCSnB1VUxXZHBYV2x2NGlHdWhEY2ttOCIKYWNjZXNzX3NlY3JldCAgIDwtICJleVB2NzFSbFB1YXo1cmNFaTdVQWRGeU9hTkdnTE54Um54cFppU0k0SjlIdHkiCmBgYAoKYGBge3J9CiMgVHdpdHRlciBhdXRoZW50aWNhdGlvbgpjcmVhdGVfdG9rZW4oCiAgYXBwICAgICAgICAgICAgID0gIm15X3R3aXR0ZXJfcmVzZWFyY2hfYXBwIiwKICBjb25zdW1lcl9rZXkgICAgPSBjb25zdW1lcl9rZXksCiAgY29uc3VtZXJfc2VjcmV0ID0gY29uc3VtZXJfc2VjcmV0LAogIGFjY2Vzc190b2tlbiAgICA9IGFjY2Vzc190b2tlbiwKICBhY2Nlc3Nfc2VjcmV0ICAgPSBhY2Nlc3Nfc2VjcmV0KQpgYGAKCmBgYHtyfQojIFJldHJpZXZlIHR3ZWV0cwp0d2VldHMgPC0gc2VhcmNoX3R3ZWV0cygiI01hY2hpbmVMZWFybmluZyIsIG4gPSAxMDAwMCwgbGFuZ3M9ImVuIiwgdHdlZXRfbW9kZT0iZXh0ZW5kZWQiKQpgYGAKCiMjIyBUd2VldHMgRGVzY3JpcHRpb24KCmBgYHtyfQojIyBwbG90IHRpbWUgc2VyaWVzIG9mIHR3ZWV0cwp0c19wbG90KHR3ZWV0cywgIjMgaG91cnMiKSArCiAgZ2dwbG90Mjo6dGhlbWVfbWluaW1hbCgpICsKICBnZ3Bsb3QyOjp0aGVtZShwbG90LnRpdGxlID0gZ2dwbG90Mjo6ZWxlbWVudF90ZXh0KGZhY2UgPSAiYm9sZCIpKSArCiAgZ2dwbG90Mjo6bGFicygKICAgIHggPSBOVUxMLCB5ID0gTlVMTCwKICAgIHRpdGxlID0gIkZyZXF1ZW5jeSBvZiAjTWFjaGluZUxlYXJuaW5nIFR3aXR0ZXIgc3RhdHVzZXMgZnJvbSBwYXN0IDkgZGF5cyIsCiAgICBzdWJ0aXRsZSA9ICJUd2l0dGVyIHN0YXR1cyAodHdlZXQpIGNvdW50cyBhZ2dyZWdhdGVkIHVzaW5nIHRocmVlLWhvdXIgaW50ZXJ2YWxzIiwKICAgIGNhcHRpb24gPSAiXG5Tb3VyY2U6IERhdGEgY29sbGVjdGVkIGZyb20gVHdpdHRlcidzIFJFU1QgQVBJIHZpYSBydHdlZXQiCiAgKQpgYGAKCmBgYHtyfQpoZWFkKHR3ZWV0cykKYGBgCgoKIyMgVGV4dCBDbGVhbmluZwoKYGBge3J9CmxpYnJhcnkodG0pCmBgYAojIyMgQnVpbGQgY29ycHVzCgpgYGB7cn0KIyBidWlsZCBhIGNvcnB1cywgYW5kIHNwZWNpZnkgdGhlIHNvdXJjZSB0byBiZSBjaGFyYWN0ZXIgdmVjdG9ycyAKbXlDb3JwdXMgPC0gQ29ycHVzKFZlY3RvclNvdXJjZSh0d2VldHMkdGV4dCkpCiMgY29udmVydCB0byBsb3dlciBjYXNlCm15Q29ycHVzIDwtIHRtX21hcChteUNvcnB1cywgY29udGVudF90cmFuc2Zvcm1lcih0b2xvd2VyKSkKIyByZW1vdmUgVVJMcwpyZW1vdmVVUkwgPC0gZnVuY3Rpb24oeCkgZ3N1YigiaHR0cFteWzpzcGFjZTpdXSoiLCAiIiwgeCkKbXlDb3JwdXMgPC0gdG1fbWFwKG15Q29ycHVzLCBjb250ZW50X3RyYW5zZm9ybWVyKHJlbW92ZVVSTCkpCiMgcmVtb3ZlIGFueXRoaW5nIG90aGVyIHRoYW4gRW5nbGlzaCBsZXR0ZXJzIG9yIHNwYWNlIApyZW1vdmVOdW1QdW5jdCA8LSBmdW5jdGlvbih4KSBnc3ViKCJbXls6YWxwaGE6XVs6c3BhY2U6XV0qIiwgIiIsIHgpIApteUNvcnB1cyA8LSB0bV9tYXAobXlDb3JwdXMsIGNvbnRlbnRfdHJhbnNmb3JtZXIocmVtb3ZlTnVtUHVuY3QpKQojIHJlbW92ZSBzdG9wd29yZHMKbXlTdG9wd29yZHMgPC0gYyhzZXRkaWZmKHN0b3B3b3JkcygnZW5nbGlzaCcpLCBjKCJyIiwgImJpZyIpKSwgInVzZSIsICJzZWUiLCAidXNlZCIsICJ2aWEiLCAiYW1wIikKbXlDb3JwdXMgPC0gdG1fbWFwKG15Q29ycHVzLCByZW1vdmVXb3JkcywgbXlTdG9wd29yZHMpCiMgcmVtb3ZlIGV4dHJhIHdoaXRlc3BhY2UKbXlDb3JwdXMgPC0gdG1fbWFwKG15Q29ycHVzLCBzdHJpcFdoaXRlc3BhY2UpCiMga2VlcCBhIGNvcHkgZm9yIHN0ZW0gY29tcGxldGlvbiBsYXRlcgpteUNvcnB1c0NvcHkgPC0gbXlDb3JwdXMKYGBgCiMjIEZyZXF1ZW50IFdvcmRzCgojIyMgQnVpbGQgVGVybSBEb2N1bWVudCBNYXRyaXgKYGBge3J9CnRkbSA8LSBUZXJtRG9jdW1lbnRNYXRyaXgobXlDb3JwdXMsIGNvbnRyb2wgPSBsaXN0KHdvcmRMZW5ndGhzID0gYygxLCBJbmYpKSkKYGBgCgpgYGB7cn0KdGRtCmBgYAoKIyMjIFRvcCBGcmVxdWVudCBUZXJtcwoKYGBge3J9CmZyZXEudGVybXMgPC0gZmluZEZyZXFUZXJtcyh0ZG0sIGxvd2ZyZXEgPSAyMCkKYGBgCmBgYHtyfQpmcmVxLnRlcm1zWzE6NTBdCmBgYAoKYGBge3J9CnRlcm0uZnJlcSA8LSByb3dTdW1zKGFzLm1hdHJpeCh0ZG0pKQp0ZXJtLmZyZXEgPC0gc3Vic2V0KHRlcm0uZnJlcSwgdGVybS5mcmVxID49IDEwMDApCmRmIDwtIGRhdGEuZnJhbWUodGVybSA9IG5hbWVzKHRlcm0uZnJlcSksIGZyZXEgPSB0ZXJtLmZyZXEpCmBgYAoKYGBge3J9CmdncGxvdDI6OmdncGxvdChkZiwgYWVzKHg9dGVybSwgeT1mcmVxKSkgKyBnZW9tX2JhcihzdGF0PSJpZGVudGl0eSIpICsKICB4bGFiKCJUZXJtcyIpICsgeWxhYigiQ291bnQiKSArIGNvb3JkX2ZsaXAoKSArCiAgdGhlbWUoYXhpcy50ZXh0PWVsZW1lbnRfdGV4dChzaXplPTcpKQpgYGAKCiMjIFdvcmRjbG91ZAoKIyMjIEJ1aWxkIFdvcmRjbG91ZApgYGB7cn0KbGlicmFyeSh3b3JkY2xvdWQpCmBgYAoKYGBge3J9Cm0gPC0gYXMubWF0cml4KHRkbSkKIyBjYWxjdWxhdGUgdGhlIGZyZXF1ZW5jeSBvZiB3b3JkcyBhbmQgc29ydCBpdCBieSBmcmVxdWVuY3kgCndvcmQuZnJlcSA8LSBzb3J0KHJvd1N1bXMobSksIGRlY3JlYXNpbmcgPSBUKQojIGNvbG9ycwpwYWwgPC0gYnJld2VyLnBhbCg5LCAiQnVHbiIpWy0oMTo0KV0KYGBgCgoKCmBgYHtyfQp3b3JkY2xvdWQod29yZHMgPSBuYW1lcyh3b3JkLmZyZXEpLCBmcmVxID0gd29yZC5mcmVxLCBtaW4uZnJlcSA9IDMwMCwKICAgIHJhbmRvbS5vcmRlciA9IEYsIGNvbG9ycyA9IHBhbCkKYGBgCgoK