Extracting Tweets

Retrieve tweets mentioning Bukalapak from Twitter using the rtweet package.

# Load packages
library(rtweet)
library(tidyverse)
# Twitter authentication
create_token(
  app             = "Farizah Twitter Text Mining",
  consumer_key    = consumer_key,
  consumer_secret = consumer_secret,
  access_token    = access_token,
  access_secret   = access_secret)
## <Token>
## <oauth_endpoint>
##  request:   https://api.twitter.com/oauth/request_token
##  authorize: https://api.twitter.com/oauth/authenticate
##  access:    https://api.twitter.com/oauth/access_token
## <oauth_app> Farizah Twitter Text Mining
##   key:    <hidden>
##   secret: <hidden>
## <credentials> oauth_token, oauth_token_secret
## ---
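The four key variables passed to create_token() are assumed to be defined elsewhere; one way to keep them out of the script is to read them from environment variables (the variable names below are illustrative).

# read the API credentials from environment variables instead of hard-coding them
consumer_key    <- Sys.getenv("TWITTER_CONSUMER_KEY")
consumer_secret <- Sys.getenv("TWITTER_CONSUMER_SECRET")
access_token    <- Sys.getenv("TWITTER_ACCESS_TOKEN")
access_secret   <- Sys.getenv("TWITTER_ACCESS_SECRET")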
# retrieve tweets mentioning "bukalapak", requesting the full (extended) tweet text
tweets <- search_tweets("bukalapak", n = 30000, tweet_mode = "extended")
## Searching for tweets...
## This may take a few seconds...
## Finished collecting tweets!
# drop tweets with duplicate text (e.g. retweets), keeping all other columns
tweets <- distinct(tweets, text, .keep_all = TRUE)
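Because the standard search endpoint only returns tweets from roughly the past week, it can be worth saving the raw pull so the rest of the analysis can be re-run on the same data (the file name below is illustrative).

# save the de-duplicated tweets for later reuse
saveRDS(tweets, "bukalapak_tweets.rds")
# tweets <- readRDS("bukalapak_tweets.rds")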

Tweet Description

# plot the time series of tweets, aggregated in three-hour intervals
ts_plot(tweets, "3 hours") +
  theme_minimal() +
  theme(plot.title = ggplot2::element_text(face = "bold")) +
  labs(
    x = NULL, y = NULL,
    title = "Frequency of Bukalapak Twitter statuses from past 9 days",
    subtitle = "Twitter status (tweet) counts aggregated using three-hour intervals",
    caption = "\nSource: Data collected from Twitter's REST API via rtweet"
  )

Based on the figure “Frequency of Bukalapak Twitter statuses from past 9 days”, over the period from 2 November 2018 to 12 November 2018 the tweet frequency peaks between 11 November 2018 and 12 November 2018, exceeding 125 tweets in a single interval. The interval between 9 November 2018 and 10 November 2018 also reaches about 125 tweets.
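The peak read off the plot can also be checked directly by counting tweets in three-hour bins from the created_at column returned by rtweet; a quick sketch:

# count tweets per three-hour interval and list the busiest bins
bins <- cut(tweets$created_at, breaks = "3 hours")
head(sort(table(bins), decreasing = TRUE))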

# look at the last 20 rows of the tweet data
tail(tweets, 20)

Text Cleaning

library(tm)
## Loading required package: NLP
## 
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
## 
##     annotate

Build corpus

# build a corpus, and specify the source to be character vectors 
myCorpus <- Corpus(VectorSource(tweets$text))
# convert to lower case
myCorpus <- tm_map(myCorpus, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(myCorpus, content_transformer(tolower)):
## transformation drops documents
# remove URLs
removeURL <- function(x) gsub("http[^[:space:]]*", "", x)
myCorpus <- tm_map(myCorpus, content_transformer(removeURL))
## Warning in tm_map.SimpleCorpus(myCorpus, content_transformer(removeURL)):
## transformation drops documents
# remove anything other than letters or spaces
removeNumPunct <- function(x) gsub("[^[:alpha:][:space:]]*", "", x) 
myCorpus <- tm_map(myCorpus, content_transformer(removeNumPunct))
## Warning in tm_map.SimpleCorpus(myCorpus,
## content_transformer(removeNumPunct)): transformation drops documents
# build the stopword list: English stopwords plus a few custom terms
myStopwords <- c(setdiff(stopwords('english'), c("r", "big")), "use", "see", "used", "via", "amp", "bukalapak")
# add Indonesian stopwords from a local word list, plus some informal variants
stopwords_id <- read.table('stopwords-id.txt', header = FALSE)
myStopwords <- c(myStopwords, as.character(stopwords_id$V1), "hi", "yg", "ya", "yuk")
# remove the stopwords from the corpus
myCorpus <- tm_map(myCorpus, removeWords, myStopwords)
## Warning in tm_map.SimpleCorpus(myCorpus, removeWords, myStopwords):
## transformation drops documents
# remove extra whitespace
myCorpus <- tm_map(myCorpus, stripWhitespace)
## Warning in tm_map.SimpleCorpus(myCorpus, stripWhitespace): transformation
## drops documents
# keep a copy for stem completion later
myCorpusCopy <- myCorpus
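The copy is kept so that, after stemming, stems can be completed back to their most frequent full form. That step is not applied to the corpus used below; purely as a reference, a sketch of it (assuming the SnowballC stemmer and illustrative object names) might look like this.

# illustrative only: stem each document, then complete stems back to full words,
# using the unstemmed copy as the dictionary
library(SnowballC)
stemmedCorpus <- tm_map(myCorpus, stemDocument)
completeStems <- function(doc, dictionary) {
  words <- unlist(strsplit(as.character(doc), " "))
  words <- words[words != ""]
  paste(stemCompletion(words, dictionary = dictionary), collapse = " ")
}
completedCorpus <- Corpus(VectorSource(sapply(stemmedCorpus, completeStems,
                                              dictionary = myCorpusCopy)))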

Frequent Words

Build Term Document Matrix

# build the term-document matrix, keeping terms of any length (the default minimum is 3 characters)
tdm <- TermDocumentMatrix(myCorpus, control = list(wordLengths = c(1, Inf)))
tdm
## <<TermDocumentMatrix (terms: 14428, documents: 6025)>>
## Non-/sparse entries: 71689/86857011
## Sparsity           : 100%
## Maximal term length: 59
## Weighting          : term frequency (tf)
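The matrix is extremely sparse: most terms occur in only a handful of the 6,025 documents. A small corner of it can be inspected to see the structure (the row and column ranges below are arbitrary).

# look at a small slice of the term-document matrix
inspect(tdm[1:5, 1:5])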

Top Frequent Terms

# terms that occur at least 20 times; show the first 50
freq.terms <- findFreqTerms(tdm, lowfreq = 20)
freq.terms[1:50]
##  [1] "ceo"          "kasih"        "tau"          "bahan"       
##  [5] "ikuti"        "kali"         "kerja"        "november"    
##  [9] "pelapak"      "rumah"        "sukses"       "belanja"     
## [13] "d"            "lupa"         "malam"        "nih"         
## [17] "promo"        "beli"         "hadiah"       "klik"        
## [21] "meningkatkan" "paket"        "transaksi"    "aja"         
## [25] "besok"        "bingung"      "cashback"     "diskon"      
## [29] "gadget"       "langsung"     "loh"          "nggak"       
## [33] "nya"          "rp"           "tunggu"       "udah"        
## [37] "selamat"      "bukatalks"    "kota"         "membantu"    
## [41] "indonesia"    "salah"        "banget"       "deh"         
## [45] "gratis"       "menarik"      "super"        "hemat"       
## [49] "kebutuhan"    "mudah"
# count total occurrences of each term and keep those appearing at least 150 times
term.freq <- rowSums(as.matrix(tdm))
term.freq <- subset(term.freq, term.freq >= 150)
df <- data.frame(term = names(term.freq), freq = term.freq)
# horizontal bar chart of the most frequent terms
ggplot(df, aes(x = reorder(term, freq), y = freq)) + geom_bar(stat = "identity") +
  xlab("Terms") + ylab("Count") + coord_flip() +
  theme(axis.text = element_text(size = 7))

The bar chart above shows the most frequent terms for Bukalapak. The most common words are bukabantuan, transaksi, tokopedia, mohon, and shopee. The appearance of other e-commerce brands such as Tokopedia and Shopee among the top terms points to competition between these platforms.
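The frequencies behind that observation can be pulled straight from the term-document matrix (the term names below are taken from the chart and are assumed to be present in the matrix).

# frequencies of the brand-related terms highlighted above
all.freq <- rowSums(as.matrix(tdm))
sort(all.freq[c("bukabantuan", "transaksi", "tokopedia", "mohon", "shopee")], decreasing = TRUE)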

Wordcloud

Build Wordcloud

library(wordcloud)
## Loading required package: RColorBrewer
m <- as.matrix(tdm)
# calculate the frequency of each word and sort by frequency
word.freq <- sort(rowSums(m), decreasing = TRUE)
# drop the lightest shades of the BuGn palette so the words stay readable
pal <- brewer.pal(9, "BuGn")[-(1:4)]
wordcloud(words = names(word.freq), freq = word.freq, min.freq = 100,
    random.order = FALSE, colors = pal)

The figure above is the word cloud for Bukalapak; the words displayed in the largest font are the most frequent.
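Note that even with random.order = FALSE the word placement is randomised, so fixing the random seed before calling wordcloud() makes the figure reproducible across runs (the seed value below is arbitrary).

# fix the layout so the word cloud looks the same on every run
set.seed(1234)
wordcloud(words = names(word.freq), freq = word.freq, min.freq = 100,
    random.order = FALSE, colors = pal)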