EXERCISE 2-TWITTER(PAHLAWAN)

Extracting Tweets

Retrieve tweets from Twitter

# Load packages
library(rtweet)
library(tidyverse)

# Twitter authentication
# Builds the OAuth token that rtweet uses for API requests. The four
# credential variables are not defined in the visible part of this document;
# presumably they are set in a hidden chunk -- confirm before running.
# NOTE(review): the value of this call is auto-printed, which is how the
# consumer key ends up in the rendered output below. Assign the result (or
# wrap the call in invisible()) if the key should stay out of the published
# document.
create_token(
  app             = "my_twitter_research_app",
  consumer_key    = consumer_key,
  consumer_secret = consumer_secret,
  access_token    = access_token,
  access_secret   = access_secret)

## <Token>
## <oauth_endpoint>
##  request:   https://api.twitter.com/oauth/request_token
##  authorize: https://api.twitter.com/oauth/authenticate
##  access:    https://api.twitter.com/oauth/access_token
## <oauth_app> my_twitter_research_app
##   key:    DDzW10CxOVC1ST877s8ZbDJfq
##   secret: <hidden>
## <credentials> oauth_token, oauth_token_secret
## ---

# Retrieve tweets
# Search for statuses matching the keyword "pahlawan", asking for up to
# 10,000 results with tweet_mode set to "extended".
tweets <- search_tweets(
  q          = "pahlawan",
  n          = 10000,
  tweet_mode = "extended"
)

## Searching for tweets...

## Finished collecting tweets!

# Keep one row per distinct tweet text, retaining all other columns of the
# row that is kept.
tweets <- tweets %>%
  distinct(text, .keep_all = TRUE)

Tweets Description

## Plot the number of tweets posted per three-hour interval
ts_plot(tweets, by = "3 hours") +
  theme_minimal() +
  theme(plot.title = element_text(face = "bold")) +
  labs(
    title    = "Frequency of pahlawan Twitter statuses from past 9 days",
    subtitle = "Twitter status (tweet) counts aggregated using three-hour intervals",
    caption  = "\nSource: Data collected from Twitter's REST API via rtweet",
    x        = NULL,
    y        = NULL
  )

# Show the last 20 rows of the tweet table.
tail(tweets, n = 20)

## # A tibble: 20 x 88
##    user_id  status_id created_at          screen_name text          source
##    <chr>    <chr>     <dttm>              <chr>       <chr>         <chr> 
##  1 1703439~ 10616304~ 2018-11-11 14:43:21 goldenstar~ "@soeyoto1 H~ Twitt~
##  2 2322124~ 10616304~ 2018-11-11 14:43:15 syifaAdzki~ @setkabgoid ~ Twitt~
##  3 9656667~ 10616303~ 2018-11-11 14:42:55 bellaayund~ "@dhanytika ~ Twitt~
##  4 7864938~ 10616303~ 2018-11-11 14:42:52 andipadill~ @ayuning_2 S~ Twitt~
##  5 1283132~ 10616302~ 2018-11-11 14:42:24 Fathimahkh_ "Selamat ata~ Twitt~
##  6 1150254~ 10616301~ 2018-11-11 14:41:55 BEM_UB      "[Memperinga~ Insta~
##  7 8574246~ 10616300~ 2018-11-11 14:41:54 kodim0735   "Satgas TNI ~ Insta~
##  8 7610894~ 10616300~ 2018-11-11 14:41:49 yasa_negar~ @TitiekSoeha~ Twitt~
##  9 7610894~ 10616292~ 2018-11-11 14:38:30 yasa_negar~ @TitiekSoeha~ Twitt~
## 10 9139480~ 10616297~ 2018-11-11 14:40:27 RagilMSN71  "@soeyoto1 @~ Twitt~
## 11 9784926~ 10616295~ 2018-11-11 14:39:38 sandrafatm~ Pak de @joko~ Emosi~
## 12 59861674 10616295~ 2018-11-11 14:39:36 AjonWu      "Mari kita r~ Twitt~
## 13 59861674 10616295~ 2018-11-11 14:39:32 AjonWu      #SahabatDikb~ Twitt~
## 14 5924199~ 10616294~ 2018-11-11 14:39:25 JunaidiPil~ "@hputrasoeh~ Twitt~
## 15 5924199~ 10616293~ 2018-11-11 14:38:48 JunaidiPil~ "@hputrasoeh~ Twitt~
## 16 3591589~ 10616293~ 2018-11-11 14:38:53 irgarbiyan~ "Klo sekolah~ Twitt~
## 17 2123286~ 10616293~ 2018-11-11 14:38:47 ilmannafi_~ Hari Pahlawa~ Twitt~
## 18 2787538~ 10616293~ 2018-11-11 14:38:44 Pa_MILHAN   "Pahlawan it~ Twitt~
## 19 1547498~ 10616292~ 2018-11-11 14:38:31 madasarmad~ Yang tersisa~ Twitt~
## 20 9163350~ 10616292~ 2018-11-11 14:38:21 chaengkue   @KM_Shownu92~ Twitt~
## # ... with 82 more variables: display_text_width <dbl>,
## #   reply_to_status_id <chr>, reply_to_user_id <chr>,
## #   reply_to_screen_name <chr>, is_quote <lgl>, is_retweet <lgl>,
## #   favorite_count <int>, retweet_count <int>, hashtags <list>,
## #   symbols <list>, urls_url <list>, urls_t.co <list>,
## #   urls_expanded_url <list>, media_url <list>, media_t.co <list>,
## #   media_expanded_url <list>, media_type <list>, ext_media_url <list>,
## #   ext_media_t.co <list>, ext_media_expanded_url <list>,
## #   ext_media_type <chr>, mentions_user_id <list>,
## #   mentions_screen_name <list>, lang <chr>, quoted_status_id <chr>,
## #   quoted_text <chr>, quoted_created_at <dttm>, quoted_source <chr>,
## #   quoted_favorite_count <int>, quoted_retweet_count <int>,
## #   quoted_user_id <chr>, quoted_screen_name <chr>, quoted_name <chr>,
## #   quoted_followers_count <int>, quoted_friends_count <int>,
## #   quoted_statuses_count <int>, quoted_location <chr>,
## #   quoted_description <chr>, quoted_verified <lgl>,
## #   retweet_status_id <chr>, retweet_text <chr>,
## #   retweet_created_at <dttm>, retweet_source <chr>,
## #   retweet_favorite_count <int>, retweet_retweet_count <int>,
## #   retweet_user_id <chr>, retweet_screen_name <chr>, retweet_name <chr>,
## #   retweet_followers_count <int>, retweet_friends_count <int>,
## #   retweet_statuses_count <int>, retweet_location <chr>,
## #   retweet_description <chr>, retweet_verified <lgl>, place_url <chr>,
## #   place_name <chr>, place_full_name <chr>, place_type <chr>,
## #   country <chr>, country_code <chr>, geo_coords <list>,
## #   coords_coords <list>, bbox_coords <list>, status_url <chr>,
## #   name <chr>, location <chr>, description <chr>, url <chr>,
## #   protected <lgl>, followers_count <int>, friends_count <int>,
## #   listed_count <int>, statuses_count <int>, favourites_count <int>,
## #   account_created_at <dttm>, verified <lgl>, profile_url <chr>,
## #   profile_expanded_url <chr>, account_lang <chr>,
## #   profile_banner_url <chr>, profile_background_url <chr>,
## #   profile_image_url <chr>

Text Cleaning

library(tm)

## Warning: package 'tm' was built under R version 3.5.1

## Loading required package: NLP

## 
## Attaching package: 'NLP'

## The following object is masked from 'package:ggplot2':
## 
##     annotate

Build corpus

# Build a tm corpus from the tweet text column; VectorSource() is the source
# type for a character vector.
myCorpus <- Corpus(VectorSource(tweets$text))
# Convert every document to lower case. tolower is a plain base function, so
# it is wrapped in content_transformer() before being handed to tm_map().
myCorpus <- tm_map(myCorpus, content_transformer(tolower))

## Warning in tm_map.SimpleCorpus(myCorpus, content_transformer(tolower)):
## transformation drops documents

# Remove URLs.
# Deletes every run of non-whitespace characters that starts with "http"
# from each element of the character vector `x` and returns the result.
removeURL <- function(x) {
  url_pattern <- "http[^[:space:]]*"
  gsub(pattern = url_pattern, replacement = "", x = x)
}
# Apply the URL stripper to every document in the corpus.
myCorpus <- tm_map(myCorpus, content_transformer(removeURL))

## Warning in tm_map.SimpleCorpus(myCorpus, content_transformer(removeURL)):
## transformation drops documents

# Remove everything that is neither a letter nor whitespace (digits,
# punctuation, symbols such as "#" and "@").
# Note: [:alpha:] is locale-dependent and is not limited to the English
# alphabet.
# The quantifier is "+" rather than "*" so the pattern only matches actual
# runs of unwanted characters instead of also matching the empty string at
# every position; the returned text is the same.
#
# x: character vector. Returns a character vector of the same length.
removeNumPunct <- function(x) {
  gsub("[^[:alpha:][:space:]]+", "", x)
}
# Apply the letters-and-whitespace filter to every document in the corpus.
myCorpus <- tm_map(myCorpus, content_transformer(removeNumPunct))

## Warning in tm_map.SimpleCorpus(myCorpus,
## content_transformer(removeNumPunct)): transformation drops documents

# Remove stopwords.
# Start from tm's English list (setdiff keeps "r" and "big" out of it), then
# add a few filler tokens and the search keyword "pahlawan".
myStopwords <- c(
  setdiff(stopwords('english'), c("r", "big")),
  "use", "see", "used", "via", "amp", "pahlawan"
)
# Indonesian stopwords come from a local file; only its first column (V1) is
# used. Two more tokens are appended by hand.
stopwords_id <- read.table(file = 'stopwords-id.txt', header = FALSE)
myStopwords <- c(
  myStopwords,
  as.character(stopwords_id[["V1"]]),
  "hi", "yg"
)
# NOTE(review): punctuation has already been stripped from the corpus at this
# point, so any stopword that itself contains punctuation would presumably no
# longer match -- confirm whether that matters for this data.
myCorpus <- tm_map(myCorpus, removeWords, myStopwords)

## Warning in tm_map.SimpleCorpus(myCorpus, removeWords, myStopwords):
## transformation drops documents

# Remove extra whitespace: apply tm's stripWhitespace to every document,
# after the URL, punctuation and stopword removals above.
myCorpus <- tm_map(myCorpus, stripWhitespace)

## Warning in tm_map.SimpleCorpus(myCorpus, stripWhitespace): transformation
## drops documents

# Keep a copy of the cleaned corpus for stem completion later.
# NOTE(review): no stemming or stem-completion step appears in the visible
# part of this document, so this copy looks unused here -- confirm before
# relying on it or removing it.
myCorpusCopy <- myCorpus

Frequent Words

Build Term Document Matrix

# Build the term-document matrix from the cleaned corpus.
# wordLengths = c(1, Inf) keeps terms of every length, from a single
# character upward.
tdm <- TermDocumentMatrix(
  x       = myCorpus,
  control = list(wordLengths = c(1, Inf))
)

# Print the matrix summary.
print(tdm)

## <<TermDocumentMatrix (terms: 11975, documents: 4300)>>
## Non-/sparse entries: 46056/51446444
## Sparsity           : 100%
## Maximal term length: 50
## Weighting          : term frequency (tf)

Top Frequent Terms

# Terms found by findFreqTerms() with a lower frequency bound of 20.
freq.terms <- findFreqTerms(tdm, lowfreq = 20)

# Preview at most the first 50 of them. head() stops at the end of the
# vector, whereas indexing with 1:50 pads the result with NA whenever fewer
# than 50 terms qualify.
head(freq.terms, 50)

##  [1] "bikin"        "bener"        "haripahlawan" "tau"         
##  [5] "berdiri"      "berjuang"     "mati"         "merdeka"     
##  [9] "pejuang"      "ajak"         "doa"          "indonesia"   
## [13] "masyarakat"   "jl"           "surabaya"     "video"       
## [17] "guru"         "jasa"         "joko"         "tanda"       
## [21] "meneruskan"   "menjaga"      "negara"       "perang"      
## [25] "selamat"      "semangat"     "tugas"        "bangsa"      
## [29] "keluarga"     "keras"        "lingkungan"   "maju"        
## [33] "membangun"    "utk"          "kali"         "memperingati"
## [37] "presiden"     "sih"          "aja"          "gak"         
## [41] "rela"         "tp"           "acara"        "lampung"     
## [45] "mengadakan"   "negeri"       "november"     "peringatan"  
## [49] "bupati"       "dg"

# Total count of each term across all documents (row sums of the dense
# term-document matrix), keeping only terms counted at least 150 times.
term.freq <- rowSums(as.matrix(tdm))
term.freq <- term.freq[term.freq >= 150]
df <- data.frame(term = names(term.freq), freq = term.freq)

# Horizontal bar chart of the retained terms and their counts.
ggplot(df, aes(x = term, y = freq)) +
  geom_col() +
  coord_flip() +
  labs(x = "Terms", y = "Count") +
  theme(axis.text = element_text(size = 7))

Wordcloud

Build Wordcloud

library(wordcloud)

## Warning: package 'wordcloud' was built under R version 3.5.1

## Loading required package: RColorBrewer

# Dense copy of the term-document matrix (terms in rows, documents in
# columns).
m <- as.matrix(tdm)
# Total count of each word across all documents, most frequent first.
# TRUE/FALSE are spelled out below because T and F are ordinary variables
# that can be reassigned.
word.freq <- sort(rowSums(m), decreasing = TRUE)
# Colors: the "BuGn" 9-colour palette without its first four entries.
pal <- brewer.pal(9, "BuGn")[-(1:4)]

# Draw the cloud from words counted at least 100 times; random.order = FALSE
# plots words in decreasing order of frequency.
wordcloud(words = names(word.freq), freq = word.freq, min.freq = 100,
    random.order = FALSE, colors = pal)

EXERCISE 2-TWITTER(PAHLAWAN)

Afifah Nur Iswari (06211540000109)

12 November 2018

Extracting Tweets

Retrieve tweets from Twitter

Tweets Description

Text Cleaning

Build corpus

Frequent Words

Build Term Document Matrix

Top Frequent Terms

Wordcloud

Build Wordcloud