Extracting Tweets

This post retrieves tweets about #Bali from Twitter with the rtweet package, cleans the text with tm, and visualises the most frequent terms in a bar chart and a word cloud.

# Load packages
library(rtweet)
library(tidyverse)
# Twitter authentication
create_token(
  app             = "halumma",
  consumer_key    = consumer_key,
  consumer_secret = consumer_secret,
  access_token    = access_token,
  access_secret   = access_secret)
## <Token>
## <oauth_endpoint>
##  request:   https://api.twitter.com/oauth/request_token
##  authorize: https://api.twitter.com/oauth/authenticate
##  access:    https://api.twitter.com/oauth/access_token
## <oauth_app> halumma
##   key:    <hidden>
##   secret: <hidden>
## <credentials> oauth_token, oauth_token_secret
## ---
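
The consumer_key, consumer_secret, access_token, and access_secret variables passed to create_token() are assumed to be defined earlier in the session. A safer pattern is to keep them out of the script entirely and read them from environment variables; a minimal sketch (the variable names are illustrative; set them in e.g. ~/.Renviron):

# read API credentials from environment variables rather than hard-coding them
consumer_key    <- Sys.getenv("TWITTER_CONSUMER_KEY")
consumer_secret <- Sys.getenv("TWITTER_CONSUMER_SECRET")
access_token    <- Sys.getenv("TWITTER_ACCESS_TOKEN")
access_secret   <- Sys.getenv("TWITTER_ACCESS_SECRET")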

# Retrieve up to 1,000 recent tweets containing #Bali
tweets <- search_tweets("#Bali", n = 1000, tweet_mode = "extended")
## Searching for tweets...
## Finished collecting tweets!
# Drop tweets with duplicate text, keeping all columns
tweets <- distinct(tweets, text, .keep_all = TRUE)
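
search_tweets() includes retweets by default, and the deduplication above only removes exact text duplicates. If the analysis should cover original tweets only, rtweet flags retweets in the is_retweet column (visible in the output below), so they can be filtered out:

# optional: keep only original tweets
tweets <- filter(tweets, !is_retweet)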

Describing the Tweets

## plot time series of tweets
ts_plot(tweets, "10 minutes") +
  theme_minimal() +
  theme(plot.title = ggplot2::element_text(face = "bold")) +
  labs(
    x = NULL, y = NULL,
    title = "Frequency of #Bali Twitter statuses from past 6 hours",
    subtitle = "Twitter status (tweet) counts aggregated using 10 minutes",
    caption = "\nSource: Data collected from Twitter's REST API via rtweet"
  )
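
ts_plot() is a convenience wrapper; roughly the same figure can be built by hand with dplyr and ggplot2, which makes the 10-minute aggregation explicit. A sketch, assuming the lubridate package is installed:

# bucket tweets into 10-minute intervals and plot the counts
library(lubridate)
tweets %>%
  mutate(interval = floor_date(created_at, "10 minutes")) %>%
  count(interval) %>%
  ggplot(aes(x = interval, y = n)) +
  geom_line() +
  theme_minimal() +
  labs(x = NULL, y = NULL,
       title = "Frequency of #Bali Twitter statuses over the past 6 hours")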

tail(tweets, 20)
## # A tibble: 20 x 88
##    user_id status_id created_at          screen_name text  source
##    <chr>   <chr>     <dttm>              <chr>       <chr> <chr> 
##  1 101634~ 10618376~ 2018-11-12 04:26:32 MellaBoBali "Mel~ Twitt~
##  2 141972~ 10618372~ 2018-11-12 04:25:10 ebook_trav~ "Ubu~ Faceb~
##  3 141972~ 10618372~ 2018-11-12 04:25:10 ebook_trav~ "Ubu~ Faceb~
##  4 981354~ 10618364~ 2018-11-12 04:21:57 Seiichiro4~ "昨日は~ Twitt~
##  5 947191~ 10618361~ 2018-11-12 04:20:36 ANGELBIRD_1 "sab~ Twitt~
##  6 955372~ 10618336~ 2018-11-12 04:10:35 ChrisWill1~ Feel~ myCBot
##  7 296901~ 10618321~ 2018-11-12 04:04:51 BaliUltima~ "Rel~ Twitt~
##  8 296901~ 10618334~ 2018-11-12 04:10:00 BaliUltima~ "Sun~ Twitt~
##  9 100092~ 10618328~ 2018-11-12 04:07:38 Hoki3681    "M. ~ Twitt~
## 10 593933~ 10618327~ 2018-11-12 04:07:06 ThilanWije~ "#In~ Twitt~
## 11 100384~ 10618323~ 2018-11-12 04:05:26 infobolaho~ "M. ~ Twitt~
## 12 100102~ 10618319~ 2018-11-12 04:04:05 RHoki368    "M. ~ Twitt~
## 13 397588~ 10618316~ 2018-11-12 04:02:48 escort168   "Cek~ twitt~
## 14 729982~ 10618309~ 2018-11-12 04:00:07 karum_bali  "Wat~ Hoots~
## 15 729982~ 10618309~ 2018-11-12 04:00:06 karum_bali  "Wat~ Hoots~
## 16 718718~ 10618309~ 2018-11-12 04:00:06 kopinkue_b~ "Try~ Hoots~
## 17 957198~ 10618309~ 2018-11-12 04:00:03 prema_rasa  "Ter~ Buffer
## 18 558589~ 10618307~ 2018-11-12 03:59:12 iPadMNi     "A J~ Twitt~
## 19 155916~ 10618303~ 2018-11-12 03:57:40 rizalsulae~ "Bea~ Insta~
## 20 724414~ 10618299~ 2018-11-12 03:56:04 xpbali      "Lux~ Twitt~
## # ... with 82 more variables: display_text_width <dbl>,
## #   reply_to_status_id <chr>, reply_to_user_id <chr>,
## #   reply_to_screen_name <chr>, is_quote <lgl>, is_retweet <lgl>,
## #   favorite_count <int>, retweet_count <int>, hashtags <list>,
## #   symbols <list>, urls_url <list>, urls_t.co <list>,
## #   urls_expanded_url <list>, media_url <list>, media_t.co <list>,
## #   media_expanded_url <list>, media_type <list>, ext_media_url <list>,
## #   ext_media_t.co <list>, ext_media_expanded_url <list>,
## #   ext_media_type <chr>, mentions_user_id <list>,
## #   mentions_screen_name <list>, lang <chr>, quoted_status_id <chr>,
## #   quoted_text <chr>, quoted_created_at <dttm>, quoted_source <chr>,
## #   quoted_favorite_count <int>, quoted_retweet_count <int>,
## #   quoted_user_id <chr>, quoted_screen_name <chr>, quoted_name <chr>,
## #   quoted_followers_count <int>, quoted_friends_count <int>,
## #   quoted_statuses_count <int>, quoted_location <chr>,
## #   quoted_description <chr>, quoted_verified <lgl>,
## #   retweet_status_id <chr>, retweet_text <chr>,
## #   retweet_created_at <dttm>, retweet_source <chr>,
## #   retweet_favorite_count <int>, retweet_retweet_count <int>,
## #   retweet_user_id <chr>, retweet_screen_name <chr>, retweet_name <chr>,
## #   retweet_followers_count <int>, retweet_friends_count <int>,
## #   retweet_statuses_count <int>, retweet_location <chr>,
## #   retweet_description <chr>, retweet_verified <lgl>, place_url <chr>,
## #   place_name <chr>, place_full_name <chr>, place_type <chr>,
## #   country <chr>, country_code <chr>, geo_coords <list>,
## #   coords_coords <list>, bbox_coords <list>, status_url <chr>,
## #   name <chr>, location <chr>, description <chr>, url <chr>,
## #   protected <lgl>, followers_count <int>, friends_count <int>,
## #   listed_count <int>, statuses_count <int>, favourites_count <int>,
## #   account_created_at <dttm>, verified <lgl>, profile_url <chr>,
## #   profile_expanded_url <chr>, account_lang <chr>,
## #   profile_banner_url <chr>, profile_background_url <chr>,
## #   profile_image_url <chr>

Text Cleaning

library(tm)
## Loading required package: NLP
## 
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
## 
##     annotate

Build Corpus

Each tm_map() call below emits a "transformation drops documents" warning; for a corpus built this way (a SimpleCorpus) the warning is known to be spurious and can be ignored.

# build a corpus, and specify the source to be character vectors 
myCorpus <- Corpus(VectorSource(tweets$text))
# convert to lower case
myCorpus <- tm_map(myCorpus, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(myCorpus, content_transformer(tolower)):
## transformation drops documents
# remove URLs
removeURL <- function(x) gsub("http[^[:space:]]*", "", x)
myCorpus <- tm_map(myCorpus, content_transformer(removeURL))
## Warning in tm_map.SimpleCorpus(myCorpus, content_transformer(removeURL)):
## transformation drops documents
# remove anything other than English letters or space 
removeNumPunct <- function(x) gsub("[^[:alpha:][:space:]]*", "", x) 
myCorpus <- tm_map(myCorpus, content_transformer(removeNumPunct))
## Warning in tm_map.SimpleCorpus(myCorpus,
## content_transformer(removeNumPunct)): transformation drops documents
# remove stopwords
myStopwords <- c(setdiff(stopwords('english'), c("r", "big")), "use", "see", "used", "via", "amp", "indihome")
# add Indonesian stopwords from a local one-word-per-line file
stopwords_id <- read.table('E:/stopwords-id.txt', header = FALSE)
myStopwords <- c(myStopwords, as.character(stopwords_id$V1), "hi", "yg")
myCorpus <- tm_map(myCorpus, removeWords, myStopwords)
## Warning in tm_map.SimpleCorpus(myCorpus, removeWords, myStopwords):
## transformation drops documents
# remove extra whitespace
myCorpus <- tm_map(myCorpus, stripWhitespace)
## Warning in tm_map.SimpleCorpus(myCorpus, stripWhitespace): transformation
## drops documents
# keep a copy for stem completion later
myCorpusCopy <- myCorpus
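
The copy matters because stemming is destructive: stemDocument() truncates words to their stems, and stemCompletion() can later restore readable forms using the unstemmed copy as a dictionary. A sketch of that optional step, assuming the SnowballC package is installed (it is not applied to the corpus analysed below):

# optional: stem each document, then complete stems back to full words
myCorpus <- tm_map(myCorpus, stemDocument)
stemCompletion2 <- function(x, dictionary) {
  x <- unlist(strsplit(as.character(x), " "))
  x <- x[x != ""]                                  # drop empty tokens
  x <- stemCompletion(x, dictionary = dictionary)
  paste(x, collapse = " ")
}
completed <- vapply(seq_along(myCorpus), function(i) {
  stemCompletion2(myCorpus[[i]], dictionary = myCorpusCopy)
}, character(1))
myCorpus <- Corpus(VectorSource(completed))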

Frequent Words

Build Term Document Matrix

tdm <- TermDocumentMatrix(myCorpus, control = list(wordLengths = c(1, Inf)))
tdm
## <<TermDocumentMatrix (terms: 2983, documents: 414)>>
## Non-/sparse entries: 6257/1228705
## Sparsity           : 99%
## Maximal term length: 40
## Weighting          : term frequency (tf)
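
Beyond raw frequencies, the matrix also supports simple co-occurrence queries: findAssocs() returns the terms whose correlation with a given term is at least the supplied threshold (the term and the 0.2 cutoff below are illustrative):

# terms correlated with "bali" at 0.2 or above
findAssocs(tdm, "bali", 0.2)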

Top Frequent Terms

freq.terms <- findFreqTerms(tdm, lowfreq = 20)
freq.terms
##  [1] "bali"         "bandung"      "surabaya"     "travel"      
##  [5] "jakarta"      "kuta"         "photo"        "ubud"        
##  [9] "beauty"       "courtesy"     "gedeprama"    "happy"       
## [13] "healthy"      "holy"         "innerharmony" "peace"       
## [17] "pinterest"    "indonesia"    "beach"        "wa"
term.freq <- rowSums(as.matrix(tdm))
term.freq <- subset(term.freq, term.freq >= 50)
df <- data.frame(term = names(term.freq), freq = term.freq)
ggplot(df, aes(x = term, y = freq)) + geom_col() +
  xlab("Terms") + ylab("Count") + coord_flip() +
  theme(axis.text = element_text(size = 7))
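
By default the bars are ordered alphabetically; sorting them by frequency usually reads better. One way, using base R's reorder() on the term factor:

# same chart with bars sorted by frequency
ggplot(df, aes(x = reorder(term, freq), y = freq)) + geom_col() +
  xlab("Terms") + ylab("Count") + coord_flip() +
  theme(axis.text = element_text(size = 7))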

Wordcloud

Build Wordcloud

library(wordcloud)
## Loading required package: RColorBrewer
m <- as.matrix(tdm)
# calculate the frequency of words and sort it by frequency 
word.freq <- sort(rowSums(m), decreasing = T)
# use the darker shades of the BuGn palette for legibility
pal <- brewer.pal(9, "BuGn")[-(1:4)]
wordcloud(words = names(word.freq), freq = word.freq, min.freq = 5,
    random.order = FALSE, colors = pal)
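
wordcloud() places words with some randomness, so the layout changes between runs; fixing the RNG seed beforehand makes the figure reproducible (the seed value is arbitrary):

# fix the seed for a reproducible layout
set.seed(1234)
wordcloud(words = names(word.freq), freq = word.freq, min.freq = 5,
    random.order = FALSE, colors = pal)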