Twitter Analysis

Extracting Tweets

Retrieve tweets from Twitter

# Load packages
library(rtweet)

Package rtweet digunakan untuk mengumpulkan data dari Twitter yang kemudian dianalisis lebih lanjut. Sebelum itu, perlu dilakukan pendaftaran API Twitter untuk mendapatkan akses.

Setelah diperoleh, token akses API Twitter tersebut digunakan untuk mengakses data dalam dokumen R Markdown ini.

# Twitter authentication
# create_token() returns the token object. Assigning the result keeps it from
# being auto-printed; the printed token includes the app's consumer key, which
# must not end up in the rendered report.
# The credential variables (consumer_key, consumer_secret, access_token,
# access_secret) are expected to be defined before this point.
twitter_token <- create_token(
  app             = "my_twitter_research_app",
  consumer_key    = consumer_key,
  consumer_secret = consumer_secret,
  access_token    = access_token,
  access_secret   = access_secret
)

# Retrieve tweets
# Request up to 10,000 statuses matching the #HariAyah hashtag.
# The former `langs = "en"` argument was removed: `langs` is not a search
# parameter (the documented name is `lang`), and the collected tweets are
# Indonesian, so the misspelled argument had no filtering effect. Spelling it
# correctly as lang = "en" would exclude the tweets this analysis targets.
tweets <- search_tweets("#HariAyah", n = 10000, tweet_mode = "extended")

## Searching for tweets...

## Finished collecting tweets!

Deskripsi tweet

# Time-series chart of tweet counts, aggregated into three-hour bins.
# Axis titles are suppressed; title, subtitle and caption are set explicitly.
tweet_volume_labels <- ggplot2::labs(
  x = NULL,
  y = NULL,
  title = "Frequency of #HariAyah Twitter statuses from past 9 days",
  subtitle = "Twitter status (tweet) counts aggregated using three-hour intervals",
  caption = "\nSource: Data collected from Twitter's REST API via rtweet"
)

ts_plot(tweets, by = "3 hours") +
  ggplot2::theme_minimal() +
  ggplot2::theme(plot.title = ggplot2::element_text(face = "bold")) +
  tweet_volume_labels

Pembersihan Data

library(tm)

## Loading required package: NLP

## 
## Attaching package: 'NLP'

## The following object is masked from 'package:ggplot2':
## 
##     annotate

Membangun Kumpulan Data

# Wrap the tweet texts in a vector source and build the corpus from it
tweet_source <- VectorSource(tweets$text)
myCorpus <- Corpus(tweet_source)
# Lower-case every document so that term counting is case-insensitive
myCorpus <- tm_map(myCorpus, content_transformer(tolower))

## Warning in tm_map.SimpleCorpus(myCorpus, content_transformer(tolower)):
## transformation drops documents

# Strip links: delete every run that starts with "http" and continues up to
# the next whitespace character. Vectorised over the character vector `x`.
removeURL <- function(x) {
  gsub(pattern = "http[^[:space:]]*", replacement = "", x = x)
}
# Run removeURL over every document; content_transformer() wraps the plain
# character function so that tm_map can apply it to the corpus content.
myCorpus <- tm_map(myCorpus, content_transformer(removeURL))

## Warning in tm_map.SimpleCorpus(myCorpus, content_transformer(removeURL)):
## transformation drops documents

# Remove every character that is neither a letter nor whitespace (digits,
# punctuation, symbols). [:alpha:] matches any alphabetic character, not only
# English letters. Removed characters are deleted, not replaced by a space, so
# text on either side of them is joined together.
# The `+` quantifier matches only non-empty runs; the result is the same as
# with `*`, without relying on zero-length matches.
removeNumPunct <- function(x) {
  gsub("[^[:alpha:][:space:]]+", "", x)
}
# Run removeNumPunct over every document of the corpus
myCorpus <- tm_map(myCorpus, content_transformer(removeNumPunct))

## Warning in tm_map.SimpleCorpus(myCorpus,
## content_transformer(removeNumPunct)): transformation drops documents

# Stop-word removal.
# Start from tm's English list, keep "r" and "big" out of it, then add a few
# extra tokens to drop.
# NOTE(review): only English stop words are removed, yet the collected tweets
# are Indonesian (function words such as "yang", "di" and "dan" remain among
# the frequent terms) -- consider adding an Indonesian stop-word list.
english_stopwords <- setdiff(stopwords("english"), c("r", "big"))
extra_stopwords <- c("use", "see", "used", "via", "amp")
myStopwords <- c(english_stopwords, extra_stopwords)
myCorpus <- tm_map(myCorpus, removeWords, myStopwords)

## Warning in tm_map.SimpleCorpus(myCorpus, removeWords, myStopwords):
## transformation drops documents

# remove extra whitespace
# (the URL, punctuation and stop-word removals above can leave consecutive
# blanks behind; stripWhitespace is applied to every document)
myCorpus <- tm_map(myCorpus, stripWhitespace)

## Warning in tm_map.SimpleCorpus(myCorpus, stripWhitespace): transformation
## drops documents

# keep a copy for stem completion later
# NOTE(review): no stemming or stem-completion step is visible in this part of
# the document -- confirm myCorpusCopy is still needed.
myCorpusCopy <- myCorpus

Frequent Words

Build Term Document Matrix

# Term-document matrix of the cleaned corpus. wordLengths = c(1, Inf) places
# no lower or upper limit on term length, so one-letter terms are kept too.
tdm_control <- list(wordLengths = c(1, Inf))
tdm <- TermDocumentMatrix(myCorpus, control = tdm_control)

# Show the matrix summary (dimensions, sparsity, weighting)
print(tdm)

## <<TermDocumentMatrix (terms: 2655, documents: 2274)>>
## Non-/sparse entries: 40843/5996627
## Sparsity           : 99%
## Maximal term length: 53
## Weighting          : term frequency (tf)

Top Frequent Terms

# Terms that occur at least 20 times in the term-document matrix
freq.terms <- findFreqTerms(tdm, lowfreq = 20)

# Show at most the first 50 of them. head() returns fewer elements when fewer
# terms qualify, whereas freq.terms[1:50] would pad the result with NA.
head(freq.terms, 50)

##  [1] "ayah"        "beliau"      "dgn"         "di"          "hari"       
##  [6] "hariayah"    "kepada"      "menulis"     "selamat"     "semua"      
## [11] "sudah"       "yg"          "berniat"     "cengeng"     "jika"       
## [16] "mendiang"    "mengingat"   "saya"        "selalu"      "soal"       
## [21] "tapi"        "urungkan"    "apa"         "asih"        "beli"       
## [26] "berlikuliku" "hidupku"     "ikan"        "ini"         "inspirasi"  
## [31] "jati"        "kalian"      "ke"          "kusampaikan" "lembah"     
## [36] "mau"         "melewati"    "menggunakan" "panggilan"   "pantun"     
## [41] "sampaikan"   "tahun"       "terimakasih" "ucapan"      "untuk"      
## [46] "wabah"       "yang"        "buatku"      "dan"         "disisimu"

# Total number of occurrences of each term, summed over all documents
all_term_counts <- rowSums(as.matrix(tdm))
# Keep only the terms that occur at least 1000 times
term.freq <- all_term_counts[all_term_counts >= 1000]
# Two-column table (term, freq) used as the data source for the bar chart
df <- data.frame(term = names(term.freq), freq = term.freq)

# Horizontal bar chart of the most frequent terms (one bar per row of df).
# Every ggplot2 function is namespace-qualified, matching the time-series
# chart earlier in the document, so the chunk does not depend on ggplot2
# being attached to the search path.
ggplot2::ggplot(df, ggplot2::aes(x = term, y = freq)) +
  ggplot2::geom_bar(stat = "identity") +
  ggplot2::xlab("Terms") +
  ggplot2::ylab("Count") +
  ggplot2::coord_flip() +
  ggplot2::theme(axis.text = ggplot2::element_text(size = 7))

Wordcloud

Build Wordcloud

library(wordcloud)

## Loading required package: RColorBrewer

# Dense term-by-document count matrix
m <- as.matrix(tdm)
# Total frequency of each word, sorted from most to least frequent.
# TRUE/FALSE are spelled out: T and F are ordinary variables that can be
# reassigned, the full constants cannot.
word.freq <- sort(rowSums(m), decreasing = TRUE)
# Colours: the 9-class "BuGn" palette without its first four entries
pal <- brewer.pal(9, "BuGn")[-(1:4)]

# Draw the cloud for words occurring at least 300 times; with
# random.order = FALSE the words are plotted in decreasing frequency.
wordcloud(words = names(word.freq), freq = word.freq, min.freq = 300,
    random.order = FALSE, colors = pal)

Twitter Analysis of #HariAyah

Affanda AHA (06211745000018)

12 November 2018