Oleh: Erna Dwi Dwi Nurindah Sari (06211745000034)

Tugas ini bertujuan untuk mengetahui hal-hal yang paling banyak dicuitkan oleh para customer triindonesia di Twitter.

library(rtweet)
library(tidyverse)
## -- Attaching packages ----------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.1.0     v purrr   0.2.5
## v tibble  1.4.2     v dplyr   0.7.8
## v tidyr   0.8.1     v stringr 1.3.0
## v readr   1.1.1     v forcats 0.3.0
## -- Conflicts -------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter()  masks stats::filter()
## x purrr::flatten() masks rtweet::flatten()
## x dplyr::lag()     masks stats::lag()

Twitter authentication

# SECURITY: this call used to embed the consumer key/secret and the access
# token/secret as plain-text literals. Credentials that have been published
# that way should be treated as compromised and regenerated.
# The four values are now read from environment variables (for example set in
# ~/.Renviron), so they are no longer part of the script source.
twitter_env <- c(
  consumer_key    = "TWITTER_CONSUMER_KEY",
  consumer_secret = "TWITTER_CONSUMER_SECRET",
  access_token    = "TWITTER_ACCESS_TOKEN",
  access_secret   = "TWITTER_ACCESS_SECRET"
)
twitter_cred <- Sys.getenv(twitter_env, unset = NA_character_)

# Fail early with a clear message instead of sending empty credentials.
twitter_missing <- twitter_env[is.na(twitter_cred) | !nzchar(twitter_cred)]
if (length(twitter_missing) > 0) {
  stop(
    "Missing Twitter credentials; set environment variable(s): ",
    paste(twitter_missing, collapse = ", "),
    call. = FALSE
  )
}

# Same app name as before; the call result is still auto-printed.
create_token(
  app             = "ernadns",
  consumer_key    = twitter_cred[["TWITTER_CONSUMER_KEY"]],
  consumer_secret = twitter_cred[["TWITTER_CONSUMER_SECRET"]],
  access_token    = twitter_cred[["TWITTER_ACCESS_TOKEN"]],
  access_secret   = twitter_cred[["TWITTER_ACCESS_SECRET"]]
)
## <Token>
## <oauth_endpoint>
##  request:   https://api.twitter.com/oauth/request_token
##  authorize: https://api.twitter.com/oauth/authenticate
##  access:    https://api.twitter.com/oauth/access_token
## <oauth_app> ernadns
##   key:    bXap9GY7zLRVjSlS3hJcGvlbS
##   secret: <hidden>
## <credentials> oauth_token, oauth_token_secret
## ---

Retrieve tweets

# Search for up to 10,000 tweets matching "triindonesia", then drop rows whose
# text duplicates an earlier row (all other columns are kept).
tweets <- search_tweets(
  q = "triindonesia",
  n = 10000,
  tweet_mode = "extended"
) %>%
  distinct(text, .keep_all = TRUE)
## Searching for tweets...
## Finished collecting tweets!

Plot time series of tweets

# Time series of tweet counts, aggregated into three-hour bins.
# Axis titles are suppressed; the plot title is set in bold.
ts_plot(tweets, by = "3 hours") +
  labs(
    title = "Frequency of triindonesia Twitter statuses from past 9 days",
    subtitle = "Twitter status (tweet) counts aggregated using three-hour intervals",
    caption = "\nSource: Data collected from Twitter's REST API via rtweet",
    x = NULL,
    y = NULL
  ) +
  theme_minimal() +
  theme(plot.title = ggplot2::element_text(face = "bold"))

Berdasarkan grafik di atas dapat diketahui bahwa dalam periode 3 November hingga 11 November, cuitan tentang triindonesia paling banyak muncul pada tanggal 5 November.

# Show the last 20 rows of the de-duplicated tweet table.
tweets %>% tail(n = 20)
## # A tibble: 20 x 88
##    user_id status_id created_at          screen_name text  source
##    <chr>   <chr>     <dttm>              <chr>       <chr> <chr> 
##  1 533001~ 10583445~ 2018-11-02 13:06:14 DellayosyR  @tri~ Twitt~
##  2 533001~ 10583446~ 2018-11-02 13:06:36 DellayosyR  @tri~ Twitt~
##  3 980857~ 10583431~ 2018-11-02 13:00:33 ardimnida   Wakt~ Twitt~
##  4 103040~ 10583377~ 2018-11-02 12:39:26 MB_alfiant~ @tri~ Twitt~
##  5 103040~ 10583288~ 2018-11-02 12:03:44 MB_alfiant~ @tri~ Twitt~
##  6 153289~ 10583376~ 2018-11-02 12:39:02 cocholava   @tri~ Twitt~
##  7 958781~ 10583339~ 2018-11-02 12:24:16 rzpw15      @tri~ Twitt~
##  8 243116~ 10583082~ 2018-11-02 10:41:54 IamAffri    "@tr~ Twitt~
##  9 243116~ 10583304~ 2018-11-02 12:10:14 IamAffri    @tej~ Twitt~
## 10 442519~ 10583246~ 2018-11-02 11:47:17 boocinjaeh~ siny~ Twitt~
## 11 413314~ 10583239~ 2018-11-02 11:44:26 teje_sarwa~ "@Ia~ Twitt~
## 12 364753~ 10583217~ 2018-11-02 11:35:33 sincereloey "@tr~ Twitt~
## 13 814537~ 10583167~ 2018-11-02 11:15:48 agness_ky   @tri~ Twitt~
## 14 961187~ 10583155~ 2018-11-02 11:11:00 Dzeus_prin~ "Kuo~ Faceb~
## 15 125960~ 10583151~ 2018-11-02 11:09:22 defalpha    @tri~ Twitt~
## 16 103840~ 10583148~ 2018-11-02 11:08:12 tiara_f08   "Mak~ Twitt~
## 17 863078~ 10583117~ 2018-11-02 10:55:46 bluenisti_~ @tri~ Twitt~
## 18 967777~ 10583087~ 2018-11-02 10:44:03 NanangR1927 @tri~ Twitt~
## 19 108881~ 10583081~ 2018-11-02 10:41:37 destin_mah~ @tri~ Twitt~
## 20 536131~ 10582950~ 2018-11-02 09:49:32 Oktav_viani @tri~ Twitt~
## # ... with 82 more variables: display_text_width <dbl>,
## #   reply_to_status_id <chr>, reply_to_user_id <chr>,
## #   reply_to_screen_name <chr>, is_quote <lgl>, is_retweet <lgl>,
## #   favorite_count <int>, retweet_count <int>, hashtags <list>,
## #   symbols <list>, urls_url <list>, urls_t.co <list>,
## #   urls_expanded_url <list>, media_url <list>, media_t.co <list>,
## #   media_expanded_url <list>, media_type <list>, ext_media_url <list>,
## #   ext_media_t.co <list>, ext_media_expanded_url <list>,
## #   ext_media_type <chr>, mentions_user_id <list>,
## #   mentions_screen_name <list>, lang <chr>, quoted_status_id <chr>,
## #   quoted_text <chr>, quoted_created_at <dttm>, quoted_source <chr>,
## #   quoted_favorite_count <int>, quoted_retweet_count <int>,
## #   quoted_user_id <chr>, quoted_screen_name <chr>, quoted_name <chr>,
## #   quoted_followers_count <int>, quoted_friends_count <int>,
## #   quoted_statuses_count <int>, quoted_location <chr>,
## #   quoted_description <chr>, quoted_verified <lgl>,
## #   retweet_status_id <chr>, retweet_text <chr>,
## #   retweet_created_at <dttm>, retweet_source <chr>,
## #   retweet_favorite_count <int>, retweet_retweet_count <int>,
## #   retweet_user_id <chr>, retweet_screen_name <chr>, retweet_name <chr>,
## #   retweet_followers_count <int>, retweet_friends_count <int>,
## #   retweet_statuses_count <int>, retweet_location <chr>,
## #   retweet_description <chr>, retweet_verified <lgl>, place_url <chr>,
## #   place_name <chr>, place_full_name <chr>, place_type <chr>,
## #   country <chr>, country_code <chr>, geo_coords <list>,
## #   coords_coords <list>, bbox_coords <list>, status_url <chr>,
## #   name <chr>, location <chr>, description <chr>, url <chr>,
## #   protected <lgl>, followers_count <int>, friends_count <int>,
## #   listed_count <int>, statuses_count <int>, favourites_count <int>,
## #   account_created_at <dttm>, verified <lgl>, profile_url <chr>,
## #   profile_expanded_url <chr>, account_lang <chr>,
## #   profile_banner_url <chr>, profile_background_url <chr>,
## #   profile_image_url <chr>

Text Cleaning

library(tm)
## Loading required package: NLP
## 
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
## 
##     annotate
# Build the corpus: each element of the tweet text column becomes one document.
myCorpus <- tweets[["text"]] %>%
  VectorSource() %>%
  Corpus()

# Strip URLs: delete every run that starts with "http" and continues up to the
# next whitespace character. Works element-wise on character vectors.
removeURL <- function(x) {
  gsub(pattern = "http[^[:space:]]*", replacement = "", x = x)
}
# Apply the URL stripper to the content of every document in the corpus.
myCorpus <- myCorpus %>%
  tm_map(content_transformer(removeURL))
## Warning in tm_map.SimpleCorpus(myCorpus, content_transformer(removeURL)):
## transformation drops documents

Remove anything other than letters or whitespace (digits, punctuation, symbols)

# Delete every character that is neither a letter nor whitespace (digits,
# punctuation, symbols). Works element-wise on character vectors.
# Note: "[:alpha:]" is not restricted to the English alphabet; what counts as
# a letter follows R's regex handling for the current locale.
removeNumPunct <- function(x) {
  # "+" rather than "*": the pattern should match only real runs of unwanted
  # characters, never the empty string at every position.
  gsub("[^[:alpha:][:space:]]+", "", x)
}
# Apply the letter/whitespace filter to the content of every document.
myCorpus <- myCorpus %>%
  tm_map(content_transformer(removeNumPunct))
## Warning in tm_map.SimpleCorpus(myCorpus,
## content_transformer(removeNumPunct)): transformation drops documents

Remove extra whitespace

# Run tm's stripWhitespace transformation over every document.
myCorpus <- myCorpus %>%
  tm_map(stripWhitespace)
## Warning in tm_map.SimpleCorpus(myCorpus, stripWhitespace): transformation
## drops documents

Keep a copy for stem completion later

# Snapshot of the corpus after the URL, non-letter and whitespace cleanup.
# NOTE(review): myCorpusCopy is not referenced again in the visible part of
# this script; confirm it is still needed.
myCorpusCopy <- myCorpus

Frequent Words

Build Term Document Matrix

# Term-document matrix of the cleaned corpus. wordLengths = c(1, Inf) sets the
# accepted term length range to "one character or more".
tdm <- myCorpus %>%
  TermDocumentMatrix(control = list(wordLengths = c(1, Inf)))
print(tdm)
## <<TermDocumentMatrix (terms: 6218, documents: 3405)>>
## Non-/sparse entries: 56288/21116002
## Sparsity           : 100%
## Maximal term length: 76
## Weighting          : term frequency (tf)

Top Frequent Terms

# Terms returned by findFreqTerms() with a lower frequency bound of 20.
freq.terms <- findFreqTerms(tdm, lowfreq = 20)
# Show at most the first 50 of them. head() is used instead of [1:50] so the
# result is not padded with NA when fewer than 50 terms qualify.
head(freq.terms, 50)
##  [1] "ada"          "admin"        "beli"         "bisa"        
##  [5] "di"           "kah"          "kak"          "triindonesia"
##  [9] "ya"           "yang"         "aktif"        "hari"        
## [13] "isi"          "masa"         "nih"          "ulang"       
## [17] "hilang"       "kalau"        "kenapa"       "mas"         
## [21] "sinyal"       "tri"          "apa"          "bgt"         
## [25] "dapet"        "dicek"        "ini"          "kuota"       
## [29] "mb"           "pas"          "saya"         "seperti"     
## [33] "sering"       "sms"          "tp"           "aktifkan"    
## [37] "lalu"         "pulsa"        "amp"          "ga"          
## [41] "kok"          "masuk"        "min"          "pagi"        
## [45] "sampe"        "sekarang"     "dari"         "dm"          
## [49] "ke"           "sama"
# Total count of each term across all documents (row sums of the dense matrix).
term.freq <- rowSums(as.matrix(tdm))
# Keep only terms counted at least 150 times.
term.freq <- term.freq[!is.na(term.freq) & term.freq >= 150]
df <- data.frame(term = names(term.freq), freq = term.freq)

# Horizontal bar chart of the retained terms and their counts.
ggplot(data = df, mapping = aes(x = term, y = freq)) +
  geom_col() +
  labs(x = "Terms", y = "Count") +
  coord_flip() +
  theme(axis.text = element_text(size = 7))

Berdasarkan gambar di atas dapat diketahui bahwa lima kata teratas yang paling sering dicuitkan oleh customer triindonesia adalah triindonesia, kak, ya, dm, dan kami. Hal tersebut terjadi karena akun Twitter triindonesia selalu membalas semua keluh kesah customer Tri di Twitter dengan sapaan "kak", dan menyarankan customer untuk menyampaikan keluh kesah atau masalahnya melalui DM (direct message) agar privasinya lebih terjaga.

Wordcloud

Build Wordcloud

library(wordcloud)
## Loading required package: RColorBrewer
# Dense copy of the term-document matrix (terms in rows, documents in columns).
m <- as.matrix(tdm)

# Total count per term, most frequent first.
# TRUE/FALSE are spelled out: T and F are ordinary variables that can be
# reassigned, whereas TRUE and FALSE are reserved words.
word.freq <- sort(rowSums(m), decreasing = TRUE)

# Colours: the 9-class "BuGn" palette with its first four entries dropped.
pal <- brewer.pal(9, "BuGn")[-(1:4)]

# Word cloud of terms counted at least 100 times; random.order = FALSE plots
# words in decreasing order of frequency.
wordcloud(
  words = names(word.freq),
  freq = word.freq,
  min.freq = 100,
  random.order = FALSE,
  colors = pal
)