Text Mining Twitter

Extracting Tweets

Retrieve tweets from Twitter

# Load packages
library(rtweet)
library(tidyverse)

# Twitter authentication
# Creates the OAuth token that rtweet uses for the Twitter API calls below.
# consumer_key, consumer_secret, access_token and access_secret are not
# defined in this document; they must already exist in the session.
# NOTE(review): the token is printed on creation and the printout includes
# the app key -- avoid publishing that output with real credentials.
create_token(
  app             = "my_twitter_research_app",
  consumer_key    = consumer_key,
  consumer_secret = consumer_secret,
  access_token    = access_token,
  access_secret   = access_secret)

## <Token>
## <oauth_endpoint>
##  request:   https://api.twitter.com/oauth/request_token
##  authorize: https://api.twitter.com/oauth/authenticate
##  access:    https://api.twitter.com/oauth/access_token
## <oauth_app> my_twitter_research_app
##   key:    rAMI0T9FDtaH40A5PWPsk04GS
##   secret: <hidden>
## <credentials> oauth_token, oauth_token_secret
## ---

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.

# Query Twitter for statuses matching #LionAir, asking for up to 1000 results
tweets <- search_tweets(
  q = "#LionAir",
  n = 1000,
  tweet_mode = "extended"
)

## Searching for tweets...

## Finished collecting tweets!

# Keep one row per distinct tweet text, retaining every other column
tweets <- tweets %>%
  distinct(text, .keep_all = TRUE)

Tweets Description

# Time series of tweet counts, aggregated into three-hour bins
ts_plot(tweets, by = "3 hours") +
  labs(
    title = "Frequency of #LionAir Twitter statuses from past 4 days",
    subtitle = "Twitter status (tweet) counts aggregated using three-hour intervals",
    caption = "\nSource: Data collected from Twitter's REST API via rtweet",
    x = NULL,
    y = NULL
  ) +
  # theme_minimal() replaces the whole theme, so the title tweak comes after it
  theme_minimal() +
  theme(plot.title = element_text(face = "bold"))

# Show the last 20 rows of the collected tweets
tail(tweets, n = 20)

## # A tibble: 20 x 88
##    user_id status_id created_at          screen_name text  source
##    <chr>   <chr>     <dttm>              <chr>       <chr> <chr> 
##  1 888477~ 10605712~ 2018-11-08 16:34:22 wikkieeee   @Spr~ Twitt~
##  2 888477~ 10605709~ 2018-11-08 16:33:14 wikkieeee   @tri~ Twitt~
##  3 618142~ 10605580~ 2018-11-08 15:42:08 JackAttack~ Lion~ Twitt~
##  4 269165~ 10605537~ 2018-11-08 15:24:51 pappu817    @pra~ Twitt~
##  5 336012~ 10605565~ 2018-11-08 15:35:53 s7u8h6ail   "#Li~ Twitt~
##  6 782946~ 10605546~ 2018-11-08 15:28:27 rmolbabel   Saya~ rmolb~
##  7 735387~ 10605541~ 2018-11-08 15:26:16 kekmoe1069  "#Li~ Twitt~
##  8 514058~ 10605504~ 2018-11-08 15:11:49 SuratAirpo~ In t~ Twitt~
##  9 285843~ 10605458~ 2018-11-08 14:53:20 PhotoTeleg~ A #L~ Twitt~
## 10 102100~ 10605441~ 2018-11-08 14:46:49 RajeevCell~ "Act~ Twitt~
## 11 100206~ 10605435~ 2018-11-08 14:44:24 lestaribun~ RS P~ "brow~
## 12 107423~ 10605435~ 2018-11-08 14:44:18 teklegion   Lion~ Twitt~
## 13 105619~ 10605411~ 2018-11-08 14:34:41 AirwaysEnj~ #enj~ Twitt~
## 14 254620~ 10605362~ 2018-11-08 14:15:17 ernest_bru~ "#bo~ Faceb~
## 15 171231~ 10605335~ 2018-11-08 14:04:46 nancyrubin  #Lio~ Twitt~
## 16 201645~ 10605335~ 2018-11-08 14:04:28 PurSpectiv~ #Lio~ Twitt~
## 17 989781~ 10605327~ 2018-11-08 14:01:32 laprensape~ "#In~ Twitt~
## 18 711760~ 10605326~ 2018-11-08 14:00:50 WIONews     "#Li~ Tweet~
## 19 897475~ 10605322~ 2018-11-08 13:59:31 GautamHard~ "New~ Twitt~
## 20 518055~ 10605320~ 2018-11-08 13:58:36 realnatman  Lion~ Twitt~
## # ... with 82 more variables: display_text_width <dbl>,
## #   reply_to_status_id <chr>, reply_to_user_id <chr>,
## #   reply_to_screen_name <chr>, is_quote <lgl>, is_retweet <lgl>,
## #   favorite_count <int>, retweet_count <int>, hashtags <list>,
## #   symbols <list>, urls_url <list>, urls_t.co <list>,
## #   urls_expanded_url <list>, media_url <list>, media_t.co <list>,
## #   media_expanded_url <list>, media_type <list>, ext_media_url <list>,
## #   ext_media_t.co <list>, ext_media_expanded_url <list>,
## #   ext_media_type <chr>, mentions_user_id <list>,
## #   mentions_screen_name <list>, lang <chr>, quoted_status_id <chr>,
## #   quoted_text <chr>, quoted_created_at <dttm>, quoted_source <chr>,
## #   quoted_favorite_count <int>, quoted_retweet_count <int>,
## #   quoted_user_id <chr>, quoted_screen_name <chr>, quoted_name <chr>,
## #   quoted_followers_count <int>, quoted_friends_count <int>,
## #   quoted_statuses_count <int>, quoted_location <chr>,
## #   quoted_description <chr>, quoted_verified <lgl>,
## #   retweet_status_id <chr>, retweet_text <chr>,
## #   retweet_created_at <dttm>, retweet_source <chr>,
## #   retweet_favorite_count <int>, retweet_retweet_count <int>,
## #   retweet_user_id <chr>, retweet_screen_name <chr>, retweet_name <chr>,
## #   retweet_followers_count <int>, retweet_friends_count <int>,
## #   retweet_statuses_count <int>, retweet_location <chr>,
## #   retweet_description <chr>, retweet_verified <lgl>, place_url <chr>,
## #   place_name <chr>, place_full_name <chr>, place_type <chr>,
## #   country <chr>, country_code <chr>, geo_coords <list>,
## #   coords_coords <list>, bbox_coords <list>, status_url <chr>,
## #   name <chr>, location <chr>, description <chr>, url <chr>,
## #   protected <lgl>, followers_count <int>, friends_count <int>,
## #   listed_count <int>, statuses_count <int>, favourites_count <int>,
## #   account_created_at <dttm>, verified <lgl>, profile_url <chr>,
## #   profile_expanded_url <chr>, account_lang <chr>,
## #   profile_banner_url <chr>, profile_background_url <chr>,
## #   profile_image_url <chr>

Text Cleaning

library(tm)

## Loading required package: NLP

## 
## Attaching package: 'NLP'

## The following object is masked from 'package:ggplot2':
## 
##     annotate

Build corpus

# Turn the tweet texts into a vector source and build the tm corpus from it
myCorpus <- tweets$text %>%
  VectorSource() %>%
  Corpus()
# Lower-case every document
myCorpus <- tm_map(myCorpus, content_transformer(tolower))

## Warning in tm_map.SimpleCorpus(myCorpus, content_transformer(tolower)):
## transformation drops documents

# Strip URLs: delete "http" together with every non-whitespace character
# that follows it.
# x: character vector; returns a character vector of the same length.
removeURL <- function(x) {
  gsub(pattern = "http[^[:space:]]*", replacement = "", x = x)
}
# Run removeURL over every document of the corpus via tm_map()
myCorpus <- tm_map(myCorpus, content_transformer(removeURL))

## Warning in tm_map.SimpleCorpus(myCorpus, content_transformer(removeURL)):
## transformation drops documents

# Remove every character that is neither alphabetic nor whitespace
# (digits, punctuation, symbols). [:alpha:] follows the locale, so
# letters outside the English alphabet are kept as well.
# x: character vector; returns a character vector of the same length.
removeNumPunct <- function(x) {
  # "+" instead of "*" so the pattern never matches the empty string
  gsub("[^[:alpha:][:space:]]+", "", x)
}
# Run removeNumPunct over every document of the corpus via tm_map()
myCorpus <- tm_map(myCorpus, content_transformer(removeNumPunct))

## Warning in tm_map.SimpleCorpus(myCorpus,
## content_transformer(removeNumPunct)): transformation drops documents

# Stopword removal.
# Start from tm's English list but keep "r" and "big", then add a few
# extra terms.
myStopwords <- setdiff(stopwords("english"), c("r", "big"))
myStopwords <- c(myStopwords, "use", "see", "used", "via", "amp", "indihome")
# Append the first column of a word list read from a local file (presumably
# Indonesian stopwords, judging by the file name), plus "hi" and "yg".
# NOTE(review): the path is absolute and machine-specific.
stopwords_id <- read.table("E:/stopwords-id.txt", header = FALSE)
myStopwords <- c(myStopwords, as.character(stopwords_id[["V1"]]), "hi", "yg")
# NOTE(review): removeNumPunct has already stripped punctuation from the
# corpus, so list entries containing apostrophes can no longer match --
# confirm whether that is intended.
myCorpus <- tm_map(myCorpus, removeWords, myStopwords)

## Warning in tm_map.SimpleCorpus(myCorpus, removeWords, myStopwords):
## transformation drops documents

# Remove extra whitespace from every document (the removals above leave
# gaps where words, URLs and symbols used to be)
myCorpus <- tm_map(myCorpus, stripWhitespace)

## Warning in tm_map.SimpleCorpus(myCorpus, stripWhitespace): transformation
## drops documents

# Keep a copy of the cleaned corpus for stem completion later.
# NOTE(review): no stemming or stem completion appears in the visible part
# of this document, so this copy may be unused -- confirm.
myCorpusCopy <- myCorpus

Frequent Words

Build Term Document Matrix

# Term-document matrix over the cleaned corpus. The word-length range
# starts at 1, so even single-letter terms are kept.
tdm <- TermDocumentMatrix(
  myCorpus,
  control = list(wordLengths = c(1, Inf))
)

# Print the matrix summary
tdm

## <<TermDocumentMatrix (terms: 2346, documents: 414)>>
## Non-/sparse entries: 6139/965105
## Sparsity           : 99%
## Maximal term length: 26
## Weighting          : term frequency (tf)

Top Frequent Terms

# Terms that occur at least 20 times across all documents
freq.terms <- findFreqTerms(tdm, lowfreq = 20)

# Show at most the first 50 of them. head() stops at the end of the vector,
# whereas freq.terms[1:50] pads the result with NA when fewer than 50 terms
# qualify.
head(freq.terms, 50)

##  [1] "de"           "indonesia"    "jt"           "lionair"     
##  [5] "air"          "klikrmol"     "lion"         "korban"      
##  [9] "jenazah"      "keluarga"     "basarnas"     "infojakarta" 
## [13] "jakarta"      "lionairjt"    "flight"       "boeing"      
## [17] "max"          "pilots"       "plane"        "crash"       
## [21] "search"       "victims"      "lionaircrash" "airasia"     
## [25] "citilink"     "garuda"       "jakpost"      "sriwijayaair"
## [29] "pesawat"

# Total count per term, summed over all documents
term.freq <- rowSums(as.matrix(tdm))
# Keep only the terms seen at least 50 times
term.freq <- term.freq[term.freq >= 50]
# One row per remaining term, ready for plotting
df <- data.frame(
  term = names(term.freq),
  freq = term.freq
)

# Horizontal bar chart of the counts for the most frequent terms
ggplot(df, aes(x = term, y = freq)) +
  geom_col() +
  labs(x = "Terms", y = "Count") +
  coord_flip() +
  theme(axis.text = element_text(size = 7))

Wordcloud

Build Wordcloud

library(wordcloud)

## Loading required package: RColorBrewer

# Dense copy of the term-document matrix
m <- as.matrix(tdm)
# Total frequency per term, most frequent first
# (TRUE/FALSE spelled out: T and F are ordinary variables that can be
# reassigned)
word.freq <- sort(rowSums(m), decreasing = TRUE)
# Palette: the nine-colour "BuGn" scale without its first four entries
pal <- brewer.pal(9, "BuGn")[-(1:4)]

# Draw the cloud for terms occurring at least 10 times, without random
# word order
wordcloud(words = names(word.freq), freq = word.freq, min.freq = 10,
    random.order = FALSE, colors = pal)

Text Mining Twitter

Rossy Budhi Pratiwi

November 12, 2018

Extracting Tweets

Retrieve tweets from Twitter

Tweets Description

Text Cleaning

Build corpus

Frequent Words

Build Term Document Matrix

Top Frequent Terms

Wordcloud

Build Wordcloud