# Load packages
library(rtweet)
library(tidyverse)
# Twitter authentication: build an OAuth token for the research app.
# The four credential values are read from variables of the same name that
# must already exist in the session (they are not defined in this section).
create_token(
  app             = "my_twitter_research_app",
  consumer_key    = consumer_key,
  consumer_secret = consumer_secret,
  access_token    = access_token,
  access_secret   = access_secret
)
## <Token>
## <oauth_endpoint>
## request: https://api.twitter.com/oauth/request_token
## authorize: https://api.twitter.com/oauth/authenticate
## access: https://api.twitter.com/oauth/access_token
## <oauth_app> my_twitter_research_app
## key: rAMI0T9FDtaH40A5PWPsk04GS
## secret: <hidden>
## <credentials> oauth_token, oauth_token_secret
## ---
# Note: the `echo = FALSE` option was added to the code chunk so that the R code that generated the plot is not printed.
# Retrieve tweets: request 1000 statuses matching the #LionAir hashtag
tweets <- search_tweets(q = "#LionAir", n = 1000, tweet_mode = "extended")
## Searching for tweets...
## Finished collecting tweets!
# Keep one row per distinct tweet text, retaining every other column
tweets <- tweets %>% distinct(text, .keep_all = TRUE)
## Plot tweet counts as a time series, aggregated into three-hour bins
ts_plot(tweets, by = "3 hours") +
  theme_minimal() +
  theme(plot.title = ggplot2::element_text(face = "bold")) +
  labs(
    title = "Frequency of #LionAir Twitter statuses from past 4 days",
    subtitle = "Twitter status (tweet) counts aggregated using three-hour intervals",
    caption = "\nSource: Data collected from Twitter's REST API via rtweet",
    # Axis titles are suppressed; the title and subtitle describe the axes
    x = NULL,
    y = NULL
  )
# Print the last 20 rows of the de-duplicated tweet table
tail(tweets, n = 20)
## # A tibble: 20 x 88
## user_id status_id created_at screen_name text source
## <chr> <chr> <dttm> <chr> <chr> <chr>
## 1 888477~ 10605712~ 2018-11-08 16:34:22 wikkieeee @Spr~ Twitt~
## 2 888477~ 10605709~ 2018-11-08 16:33:14 wikkieeee @tri~ Twitt~
## 3 618142~ 10605580~ 2018-11-08 15:42:08 JackAttack~ Lion~ Twitt~
## 4 269165~ 10605537~ 2018-11-08 15:24:51 pappu817 @pra~ Twitt~
## 5 336012~ 10605565~ 2018-11-08 15:35:53 s7u8h6ail "#Li~ Twitt~
## 6 782946~ 10605546~ 2018-11-08 15:28:27 rmolbabel Saya~ rmolb~
## 7 735387~ 10605541~ 2018-11-08 15:26:16 kekmoe1069 "#Li~ Twitt~
## 8 514058~ 10605504~ 2018-11-08 15:11:49 SuratAirpo~ In t~ Twitt~
## 9 285843~ 10605458~ 2018-11-08 14:53:20 PhotoTeleg~ A #L~ Twitt~
## 10 102100~ 10605441~ 2018-11-08 14:46:49 RajeevCell~ "Act~ Twitt~
## 11 100206~ 10605435~ 2018-11-08 14:44:24 lestaribun~ RS P~ "brow~
## 12 107423~ 10605435~ 2018-11-08 14:44:18 teklegion Lion~ Twitt~
## 13 105619~ 10605411~ 2018-11-08 14:34:41 AirwaysEnj~ #enj~ Twitt~
## 14 254620~ 10605362~ 2018-11-08 14:15:17 ernest_bru~ "#bo~ Faceb~
## 15 171231~ 10605335~ 2018-11-08 14:04:46 nancyrubin #Lio~ Twitt~
## 16 201645~ 10605335~ 2018-11-08 14:04:28 PurSpectiv~ #Lio~ Twitt~
## 17 989781~ 10605327~ 2018-11-08 14:01:32 laprensape~ "#In~ Twitt~
## 18 711760~ 10605326~ 2018-11-08 14:00:50 WIONews "#Li~ Tweet~
## 19 897475~ 10605322~ 2018-11-08 13:59:31 GautamHard~ "New~ Twitt~
## 20 518055~ 10605320~ 2018-11-08 13:58:36 realnatman Lion~ Twitt~
## # ... with 82 more variables: display_text_width <dbl>,
## # reply_to_status_id <chr>, reply_to_user_id <chr>,
## # reply_to_screen_name <chr>, is_quote <lgl>, is_retweet <lgl>,
## # favorite_count <int>, retweet_count <int>, hashtags <list>,
## # symbols <list>, urls_url <list>, urls_t.co <list>,
## # urls_expanded_url <list>, media_url <list>, media_t.co <list>,
## # media_expanded_url <list>, media_type <list>, ext_media_url <list>,
## # ext_media_t.co <list>, ext_media_expanded_url <list>,
## # ext_media_type <chr>, mentions_user_id <list>,
## # mentions_screen_name <list>, lang <chr>, quoted_status_id <chr>,
## # quoted_text <chr>, quoted_created_at <dttm>, quoted_source <chr>,
## # quoted_favorite_count <int>, quoted_retweet_count <int>,
## # quoted_user_id <chr>, quoted_screen_name <chr>, quoted_name <chr>,
## # quoted_followers_count <int>, quoted_friends_count <int>,
## # quoted_statuses_count <int>, quoted_location <chr>,
## # quoted_description <chr>, quoted_verified <lgl>,
## # retweet_status_id <chr>, retweet_text <chr>,
## # retweet_created_at <dttm>, retweet_source <chr>,
## # retweet_favorite_count <int>, retweet_retweet_count <int>,
## # retweet_user_id <chr>, retweet_screen_name <chr>, retweet_name <chr>,
## # retweet_followers_count <int>, retweet_friends_count <int>,
## # retweet_statuses_count <int>, retweet_location <chr>,
## # retweet_description <chr>, retweet_verified <lgl>, place_url <chr>,
## # place_name <chr>, place_full_name <chr>, place_type <chr>,
## # country <chr>, country_code <chr>, geo_coords <list>,
## # coords_coords <list>, bbox_coords <list>, status_url <chr>,
## # name <chr>, location <chr>, description <chr>, url <chr>,
## # protected <lgl>, followers_count <int>, friends_count <int>,
## # listed_count <int>, statuses_count <int>, favourites_count <int>,
## # account_created_at <dttm>, verified <lgl>, profile_url <chr>,
## # profile_expanded_url <chr>, account_lang <chr>,
## # profile_banner_url <chr>, profile_background_url <chr>,
## # profile_image_url <chr>
library(tm)
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
# Build a corpus from the tweet texts, using the character vector as the source
myCorpus <- tweets$text %>%
  VectorSource() %>%
  Corpus()
# Lower-case every document; content_transformer() adapts the plain
# character function tolower() for use with tm_map()
myCorpus <- tm_map(myCorpus, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(myCorpus, content_transformer(tolower)):
## transformation drops documents
# Strip URLs: delete "http" together with every non-whitespace character
# that follows it, up to the next whitespace or the end of the string.
# x: character vector; returns a character vector of the same length.
removeURL <- function(x) {
  gsub(pattern = "http[^[:space:]]*", replacement = "", x = x)
}
# Apply removeURL to every document in the corpus; content_transformer()
# wraps the plain character function so tm_map() can use it on documents.
myCorpus <- tm_map(myCorpus, content_transformer(removeURL))
## Warning in tm_map.SimpleCorpus(myCorpus, content_transformer(removeURL)):
## transformation drops documents
# Keep only alphabetic characters and whitespace: digits, punctuation and
# other symbols are deleted (not replaced by a space, so "lion-air" becomes
# "lionair"). Note that [:alpha:] is locale-dependent and is not restricted
# to unaccented English letters.
# x: character vector; returns a character vector of the same length.
removeNumPunct <- function(x) {
  gsub(pattern = "[^[:alpha:][:space:]]*", replacement = "", x = x)
}
# Apply removeNumPunct to every document in the corpus (runs after URL
# removal, so URL punctuation has already been stripped with the URL itself).
myCorpus <- tm_map(myCorpus, content_transformer(removeNumPunct))
## Warning in tm_map.SimpleCorpus(myCorpus,
## content_transformer(removeNumPunct)): transformation drops documents
# Remove stopwords.
# Start from tm's English list (keeping "r" and "big" as real terms) and add a
# few filler words specific to this analysis.
myStopwords <- c(
  setdiff(stopwords("english"), c("r", "big")),
  "use", "see", "used", "via", "amp", "indihome"
)
# Indonesian stopwords come from an external word list (first column is used).
# The file is read as plain text:
#   * quote = ""         - apostrophes/quotes inside a word are ordinary
#                          characters, not string delimiters (the read.table
#                          default can silently merge or truncate entries);
#   * comment.char = ""  - a "#" does not cut off the rest of a line;
#   * colClasses         - the column is character on every R version.
# NOTE(review): the path is machine-specific; adjust it for your environment.
stopwords_id <- read.table(
  "E:/stopwords-id.txt",
  header = FALSE,
  quote = "",
  comment.char = "",
  colClasses = "character"
)
myStopwords <- c(myStopwords, stopwords_id$V1, "hi", "yg")
# NOTE(review): punctuation has already been stripped from the corpus at this
# point, so list entries that contain an apostrophe will not match their
# stripped forms in the text - confirm whether that matters for this analysis.
myCorpus <- tm_map(myCorpus, removeWords, myStopwords)
## Warning in tm_map.SimpleCorpus(myCorpus, removeWords, myStopwords):
## transformation drops documents
# remove extra whitespace (collapses the runs of blanks left behind by the
# URL, punctuation and stopword removals above)
myCorpus <- tm_map(myCorpus, stripWhitespace)
## Warning in tm_map.SimpleCorpus(myCorpus, stripWhitespace): transformation
## drops documents
# keep a copy for stem completion later
# NOTE(review): no stemming step is visible in this section - confirm the copy
# is still needed.
myCorpusCopy <- myCorpus
# Build the term-document matrix (terms in rows, documents in columns).
# wordLengths = c(1, Inf) sets the minimum term length to 1 with no upper
# bound, so short terms are not filtered out by length.
tdm <- TermDocumentMatrix(myCorpus, control = list(wordLengths = c(1, Inf)))
# Print a summary of the matrix (dimensions, sparsity, weighting)
tdm
## <<TermDocumentMatrix (terms: 2346, documents: 414)>>
## Non-/sparse entries: 6139/965105
## Sparsity : 99%
## Maximal term length: 26
## Weighting : term frequency (tf)
# Terms that occur at least 20 times across the whole corpus
freq.terms <- findFreqTerms(tdm, lowfreq = 20)
# Show at most the first 50 terms. head() stops at the end of the vector,
# whereas freq.terms[1:50] pads the result with NA whenever fewer than 50
# terms qualify (only 29 do here).
head(freq.terms, 50)
## [1] "de" "indonesia" "jt" "lionair"
## [5] "air" "klikrmol" "lion" "korban"
## [9] "jenazah" "keluarga" "basarnas" "infojakarta"
## [13] "jakarta" "lionairjt" "flight" "boeing"
## [17] "max" "pilots" "plane" "crash"
## [21] "search" "victims" "lionaircrash" "airasia"
## [25] "citilink" "garuda" "jakpost" "sriwijayaair"
## [29] "pesawat"
# Total number of occurrences of each term, summed over all documents
term.freq <- rowSums(as.matrix(tdm))
# Keep only the terms that occur at least 50 times
term.freq <- term.freq[term.freq >= 50]
df <- data.frame(term = names(term.freq), freq = term.freq)
# Horizontal bar chart of the most frequent terms
ggplot(df, aes(x = term, y = freq)) +
  geom_bar(stat = "identity") +
  labs(x = "Terms", y = "Count") +
  coord_flip() +
  theme(axis.text = element_text(size = 7))
library(wordcloud)
## Loading required package: RColorBrewer
# Dense term-by-document count matrix
m <- as.matrix(tdm)
# Total count per term, sorted with the most frequent term first.
# TRUE/FALSE are spelled out: T and F are ordinary variables that can be
# reassigned, which would silently change the behaviour of these calls.
word.freq <- sort(rowSums(m), decreasing = TRUE)
# Colours: take the 9-colour "BuGn" palette and drop its first four entries
pal <- brewer.pal(9, "BuGn")[-(1:4)]
# Word cloud of terms that occur at least 10 times; random.order = FALSE
# plots words in decreasing order of frequency
wordcloud(
  words = names(word.freq), freq = word.freq, min.freq = 10,
  random.order = FALSE, colors = pal
)