Extracting Tweets

Retrieve tweets from Twitter

# Load packages
library(rtweet)
library(tidyverse)
# Authenticate against the Twitter API and print the resulting token.
# consumer_key, consumer_secret, access_token and access_secret are not
# defined in the code shown here; they must already exist in the session.
# NOTE(review): keep those credentials out of any shared or committed file.
create_token(
  app = "my_twitter_research_app",
  consumer_key = consumer_key,
  consumer_secret = consumer_secret,
  access_token = access_token,
  access_secret = access_secret
)
<Token>
<oauth_endpoint>
 request:   https://api.twitter.com/oauth/request_token
 authorize: https://api.twitter.com/oauth/authenticate
 access:    https://api.twitter.com/oauth/access_token
<oauth_app> my_twitter_research_app
  key:    koZFJbYVk7gagYEGLaUN6RmTb
  secret: <hidden>
<credentials> oauth_token, oauth_token_secret
---
# Retrieve tweets mentioning "indihome".
# A single token returns at most 18,000 tweets per rate-limit window, so a
# request for 30,000 needs retryonratelimit = TRUE: rtweet then waits for the
# limit to reset and continues collecting instead of stopping short.
tweets <- search_tweets(
  "indihome",
  n = 30000,
  retryonratelimit = TRUE,
  tweet_mode = "extended"
)
Searching for tweets...
This may take a few seconds...
Finished collecting tweets!
# Keep one row per unique tweet text, retaining all other columns
tweets <- tweets %>%
  distinct(text, .keep_all = TRUE)

Tweets Description

## Time series of tweet counts, aggregated into three-hour bins
ts_plot(tweets, by = "3 hours") +
  labs(
    title = "Frequency of indihome Twitter statuses from past 9 days",
    subtitle = "Twitter status (tweet) counts aggregated using three-hour intervals",
    caption = "\nSource: Data collected from Twitter's REST API via rtweet",
    x = NULL,
    y = NULL
  ) +
  # theme_minimal() must come before theme() so the bold title is not reset
  theme_minimal() +
  theme(plot.title = ggplot2::element_text(face = "bold"))

# Show the last 20 rows of the de-duplicated tweets
tail(tweets, n = 20)

Text Cleaning

library(tm)
Loading required package: NLP

Attaching package: ‘NLP’

The following object is masked from ‘package:ggplot2’:

    annotate

Build corpus

# Build a tm corpus in which each tweet's text is one document
myCorpus <- tweets$text %>%
  VectorSource() %>%
  Corpus()
# Lower-case every document
myCorpus <- myCorpus %>%
  tm_map(content_transformer(tolower))
transformation drops documents
# Strip URLs: delete "http" plus every following non-whitespace character
removeURL <- function(x) {
  gsub(pattern = "http[^[:space:]]*", replacement = "", x = x)
}
myCorpus <- myCorpus %>%
  tm_map(content_transformer(removeURL))
transformation drops documents
# Remove every character that is neither alphabetic nor whitespace (digits,
# punctuation, symbols). [:alpha:] follows the current locale, so it is not
# restricted to English letters. Characters are deleted, not replaced by a
# space, so "a,b" becomes "ab".
# The "+" quantifier matches only real runs of unwanted characters; the
# result is the same as with "*", without zero-length matches.
removeNumPunct <- function(x) {
  gsub("[^[:alpha:][:space:]]+", "", x)
}
myCorpus <- myCorpus %>%
  tm_map(content_transformer(removeNumPunct))
transformation drops documents
# Remove stop words.
# English list from tm, minus "r" and "big", plus a few corpus-specific terms
english_stopwords <- setdiff(stopwords("english"), c("r", "big"))
extra_stopwords <- c("use", "see", "used", "via", "amp", "indihome")
# Additional stop words taken from the first column of a local file
stopwords_id <- read.table("stopwords-id.txt", header = FALSE)
myStopwords <- c(
  english_stopwords,
  extra_stopwords,
  as.character(stopwords_id$V1),
  "hi", "yg"
)
myCorpus <- tm_map(myCorpus, removeWords, myStopwords)
transformation drops documents
# Collapse the extra whitespace left behind by the removals above
myCorpus <- myCorpus %>%
  tm_map(stripWhitespace)
transformation drops documents
# Keep a snapshot of the cleaned corpus for stem completion later.
# NOTE(review): no stemming or stem-completion step is visible in this
# section - confirm the copy is still needed.
myCorpusCopy <- myCorpus

Frequent Words

Build Term Document Matrix

# Term-document matrix; wordLengths = c(1, Inf) keeps terms of any length,
# including single-character ones
tdm_control <- list(wordLengths = c(1, Inf))
tdm <- TermDocumentMatrix(myCorpus, control = tdm_control)
tdm
<<TermDocumentMatrix (terms: 8298, documents: 3907)>>
Non-/sparse entries: 43440/32376846
Sparsity           : 100%
Maximal term length: 71
Weighting          : term frequency (tf)

Top Frequent Terms

# Terms that occur at least 20 times across the corpus
freq.terms <- findFreqTerms(tdm, lowfreq = 20)
# Show up to the first 50; head() does not pad with NA when fewer than 50
# terms qualify, unlike freq.terms[1:50]
head(freq.terms, 50)
 [1] "kabar"        "kak"          "layanan"      "selamat"      "senang"       "trims"        "cek"          "harga"       
 [9] "info"         "klik"         "paket"        "aktif"        "cp"           "dm"           "follow"       "infokan"     
[17] "nama"         "nomor"        "nya"          "pemilik"      "silakan"      "tunggu"       "ya"           "thanks"      
[25] "berlangganan" "aplikasi"     "fiber"        "jaringan"     "myindihome"   "pemasangan"   "tercover"     "app"         
[33] "channel"      "download"     "play"         "speed"        "tagihan"      "useetv"       "dibantu"      "kasih"       
[41] "seputar"      "terima"       "atm"          "detail"       "dicoba"       "konfirmasi"   "pembayaran"   "malam"       
[49] "perihal"      "produk"      
# Total count of each term across all documents
term.freq <- rowSums(as.matrix(tdm))
# Keep only terms occurring at least 150 times
term.freq <- term.freq[term.freq >= 150]
df <- data.frame(term = names(term.freq), freq = term.freq)
# Horizontal bar chart of the most frequent terms
ggplot(df, aes(x = term, y = freq)) +
  geom_col() +
  labs(x = "Terms", y = "Count") +
  coord_flip() +
  theme(axis.text = element_text(size = 7))

Wordcloud

Build Wordcloud

library(wordcloud)
Loading required package: RColorBrewer
# Dense term-by-document count matrix
m <- as.matrix(tdm)
# Total frequency of each term, sorted from most to least frequent.
# TRUE/FALSE are used instead of T/F, which are ordinary variables that can
# be reassigned.
word.freq <- sort(rowSums(m), decreasing = TRUE)
# Colours: 9-step "BuGn" palette with the first four shades dropped
pal <- brewer.pal(9, "BuGn")[-(1:4)]
# Draw terms occurring at least 100 times; random.order = FALSE plots them
# in decreasing order of frequency
wordcloud(
  words = names(word.freq), freq = word.freq, min.freq = 100,
  random.order = FALSE, colors = pal
)

LS0tCnRpdGxlOiAiVHdpdHRlciBBbmFseXNpcyIKb3V0cHV0OgogIGh0bWxfbm90ZWJvb2s6CiAgICB0b2M6IHllcwogICAgdG9jX2Zsb2F0OiB0cnVlCi0tLQoKIyMgRXh0cmFjdGluZyBUd2VldHMKCiMjIyBSZXRyaWV2ZSB0d2VldHMgZnJvbSBUd2l0dGVyCgpgYGB7ciBtZXNzYWdlPUZBTFNFLCB3YXJuaW5nPUZBTFNFfQojIExvYWQgcGFja2FnZXMKbGlicmFyeShydHdlZXQpCmxpYnJhcnkodGlkeXZlcnNlKQpgYGAKCmBgYHtyIGluY2x1ZGU9RkFMU0V9CiMgQWNjZXNzIHRva2VuIGFuZCBBUElzCmNvbnN1bWVyX2tleSAgICA8LSAia29aRkpiWVZrN2dhZ1lFR0xhVU42Um1UYiIKY29uc3VtZXJfc2VjcmV0IDwtICJUWDRiaWNGWXVLQkRrZkloTkNTS1RwYTlySlpDVWRMQmMwYjRzeXJtUGw4MUZubTdhaiIKYWNjZXNzX3Rva2VuICAgIDwtICI2Nzc3MzQwNi10U2RSZ0xlS2R6V2J6ZFVLajlCSnB1VUxXZHBYV2x2NGlHdWhEY2ttOCIKYWNjZXNzX3NlY3JldCAgIDwtICJleVB2NzFSbFB1YXo1cmNFaTdVQWRGeU9hTkdnTE54Um54cFppU0k0SjlIdHkiCmBgYAoKYGBge3J9CiMgVHdpdHRlciBhdXRoZW50aWNhdGlvbgpjcmVhdGVfdG9rZW4oCiAgYXBwICAgICAgICAgICAgID0gIm15X3R3aXR0ZXJfcmVzZWFyY2hfYXBwIiwKICBjb25zdW1lcl9rZXkgICAgPSBjb25zdW1lcl9rZXksCiAgY29uc3VtZXJfc2VjcmV0ID0gY29uc3VtZXJfc2VjcmV0LAogIGFjY2Vzc190b2tlbiAgICA9IGFjY2Vzc190b2tlbiwKICBhY2Nlc3Nfc2VjcmV0ICAgPSBhY2Nlc3Nfc2VjcmV0KQpgYGAKCmBgYHtyfQojIFJldHJpZXZlIHR3ZWV0cwp0d2VldHMgPC0gc2VhcmNoX3R3ZWV0cygiaW5kaWhvbWUiLCBuID0gMzAwMDAsIHR3ZWV0X21vZGU9ImV4dGVuZGVkIikKdHdlZXRzIDwtIGRpc3RpbmN0KHR3ZWV0cywgdGV4dCwgLmtlZXBfYWxsPVRSVUUpCmBgYAoKCiMjIyBUd2VldHMgRGVzY3JpcHRpb24KCmBgYHtyfQojIyBwbG90IHRpbWUgc2VyaWVzIG9mIHR3ZWV0cwp0c19wbG90KHR3ZWV0cywgIjMgaG91cnMiKSArCiAgdGhlbWVfbWluaW1hbCgpICsKICB0aGVtZShwbG90LnRpdGxlID0gZ2dwbG90Mjo6ZWxlbWVudF90ZXh0KGZhY2UgPSAiYm9sZCIpKSArCiAgbGFicygKICAgIHggPSBOVUxMLCB5ID0gTlVMTCwKICAgIHRpdGxlID0gIkZyZXF1ZW5jeSBvZiBpbmRpaG9tZSBUd2l0dGVyIHN0YXR1c2VzIGZyb20gcGFzdCA5IGRheXMiLAogICAgc3VidGl0bGUgPSAiVHdpdHRlciBzdGF0dXMgKHR3ZWV0KSBjb3VudHMgYWdncmVnYXRlZCB1c2luZyB0aHJlZS1ob3VyIGludGVydmFscyIsCiAgICBjYXB0aW9uID0gIlxuU291cmNlOiBEYXRhIGNvbGxlY3RlZCBmcm9tIFR3aXR0ZXIncyBSRVNUIEFQSSB2aWEgcnR3ZWV0IgogICkKYGBgCgpgYGB7cn0KdGFpbCh0d2VldHMsIDIwKQpgYGAKCgojIyBUZXh0IENsZWFuaW5nCgpgYGB7cn0KbGlicmFyeSh0bSkKYGBgCiMjIyBCdWlsZCBjb3JwdXMKCmBgYHtyfQojIGJ1aWxkIGEgY29ycHVzLCBhbmQg
c3BlY2lmeSB0aGUgc291cmNlIHRvIGJlIGNoYXJhY3RlciB2ZWN0b3JzIApteUNvcnB1cyA8LSBDb3JwdXMoVmVjdG9yU291cmNlKHR3ZWV0cyR0ZXh0KSkKIyBjb252ZXJ0IHRvIGxvd2VyIGNhc2UKbXlDb3JwdXMgPC0gdG1fbWFwKG15Q29ycHVzLCBjb250ZW50X3RyYW5zZm9ybWVyKHRvbG93ZXIpKQojIHJlbW92ZSBVUkxzCnJlbW92ZVVSTCA8LSBmdW5jdGlvbih4KSBnc3ViKCJodHRwW15bOnNwYWNlOl1dKiIsICIiLCB4KQpteUNvcnB1cyA8LSB0bV9tYXAobXlDb3JwdXMsIGNvbnRlbnRfdHJhbnNmb3JtZXIocmVtb3ZlVVJMKSkKIyByZW1vdmUgYW55dGhpbmcgb3RoZXIgdGhhbiBFbmdsaXNoIGxldHRlcnMgb3Igc3BhY2UgCnJlbW92ZU51bVB1bmN0IDwtIGZ1bmN0aW9uKHgpIGdzdWIoIlteWzphbHBoYTpdWzpzcGFjZTpdXSoiLCAiIiwgeCkgCm15Q29ycHVzIDwtIHRtX21hcChteUNvcnB1cywgY29udGVudF90cmFuc2Zvcm1lcihyZW1vdmVOdW1QdW5jdCkpCiMgcmVtb3ZlIHN0b3B3b3JkcwpteVN0b3B3b3JkcyA8LSBjKHNldGRpZmYoc3RvcHdvcmRzKCdlbmdsaXNoJyksIGMoInIiLCAiYmlnIikpLCAidXNlIiwgInNlZSIsICJ1c2VkIiwgInZpYSIsICJhbXAiLCAiaW5kaWhvbWUiKQpzdG9wd29yZHNfaWQgPC0gcmVhZC50YWJsZSgnc3RvcHdvcmRzLWlkLnR4dCcsIGhlYWRlciA9IEZBTFNFKQpteVN0b3B3b3JkcyA8LSBjKG15U3RvcHdvcmRzLCBhcy5tYXRyaXgoc3RvcHdvcmRzX2lkJFYxKSwgImhpIiwgInlnIikKbXlDb3JwdXMgPC0gdG1fbWFwKG15Q29ycHVzLCByZW1vdmVXb3JkcywgbXlTdG9wd29yZHMpCiMgcmVtb3ZlIGV4dHJhIHdoaXRlc3BhY2UKbXlDb3JwdXMgPC0gdG1fbWFwKG15Q29ycHVzLCBzdHJpcFdoaXRlc3BhY2UpCiMga2VlcCBhIGNvcHkgZm9yIHN0ZW0gY29tcGxldGlvbiBsYXRlcgpteUNvcnB1c0NvcHkgPC0gbXlDb3JwdXMKYGBgCiMjIEZyZXF1ZW50IFdvcmRzCgojIyMgQnVpbGQgVGVybSBEb2N1bWVudCBNYXRyaXgKYGBge3J9CnRkbSA8LSBUZXJtRG9jdW1lbnRNYXRyaXgobXlDb3JwdXMsIGNvbnRyb2wgPSBsaXN0KHdvcmRMZW5ndGhzID0gYygxLCBJbmYpKSkKYGBgCgpgYGB7cn0KdGRtCmBgYAoKIyMjIFRvcCBGcmVxdWVudCBUZXJtcwoKYGBge3J9CmZyZXEudGVybXMgPC0gZmluZEZyZXFUZXJtcyh0ZG0sIGxvd2ZyZXEgPSAyMCkKYGBgCmBgYHtyfQpmcmVxLnRlcm1zWzE6NTBdCmBgYAoKYGBge3J9CnRlcm0uZnJlcSA8LSByb3dTdW1zKGFzLm1hdHJpeCh0ZG0pKQp0ZXJtLmZyZXEgPC0gc3Vic2V0KHRlcm0uZnJlcSwgdGVybS5mcmVxID49IDE1MCkKZGYgPC0gZGF0YS5mcmFtZSh0ZXJtID0gbmFtZXModGVybS5mcmVxKSwgZnJlcSA9IHRlcm0uZnJlcSkKYGBgCgpgYGB7cn0KZ2dwbG90KGRmLCBhZXMoeD10ZXJtLCB5PWZyZXEpKSArIGdlb21fYmFyKHN0YXQ9ImlkZW50aXR5IikgKwogIHhsYWIoIlRlcm1zIikgKyB5bGFiKCJDb3VudCIpICsgY29vcmRfZmxpcCgpICsKICB0
aGVtZShheGlzLnRleHQ9ZWxlbWVudF90ZXh0KHNpemU9NykpCmBgYAoKIyMgV29yZGNsb3VkCgojIyMgQnVpbGQgV29yZGNsb3VkCmBgYHtyfQpsaWJyYXJ5KHdvcmRjbG91ZCkKYGBgCgpgYGB7cn0KbSA8LSBhcy5tYXRyaXgodGRtKQojIGNhbGN1bGF0ZSB0aGUgZnJlcXVlbmN5IG9mIHdvcmRzIGFuZCBzb3J0IGl0IGJ5IGZyZXF1ZW5jeSAKd29yZC5mcmVxIDwtIHNvcnQocm93U3VtcyhtKSwgZGVjcmVhc2luZyA9IFQpCiMgY29sb3JzCnBhbCA8LSBicmV3ZXIucGFsKDksICJCdUduIilbLSgxOjQpXQpgYGAKCgoKYGBge3J9CndvcmRjbG91ZCh3b3JkcyA9IG5hbWVzKHdvcmQuZnJlcSksIGZyZXEgPSB3b3JkLmZyZXEsIG1pbi5mcmVxID0gMTAwLAogICAgcmFuZG9tLm9yZGVyID0gRiwgY29sb3JzID0gcGFsKQpgYGAKCgo=