======================================================================================

TELKOMSEL

Extracting Tweets

Retrieve tweets from Twitter

## -- Attaching packages ---------------------------------------------------------- tidyverse 1.2.1 --

## v ggplot2 3.1.0     v purrr   0.2.5
## v tibble  1.4.2     v dplyr   0.7.7
## v tidyr   0.8.2     v stringr 1.3.1
## v readr   1.1.1     v forcats 0.3.0

## -- Conflicts ------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter()  masks stats::filter()
## x purrr::flatten() masks rtweet::flatten()
## x dplyr::lag()     masks stats::lag()

## <Token>
## <oauth_endpoint>
##  request:   https://api.twitter.com/oauth/request_token
##  authorize: https://api.twitter.com/oauth/authenticate
##  access:    https://api.twitter.com/oauth/access_token
## <oauth_app> my_twitter_research_app
##   key:    sxxdmMv0ceEXTFN0ZlqsdTcdu
##   secret: <hidden>
## <credentials> oauth_token, oauth_token_secret
## ---

# Query the Twitter search API for statuses tagged #Telkomsel (n = 8000 requested)
tweets <- search_tweets(
  q = "#Telkomsel",
  n = 8000,
  tweet_mode = "extended"
)

## Searching for tweets...

## Finished collecting tweets!

# Drop rows whose tweet text repeats an earlier row; keep every column
tweets <- distinct(.data = tweets, text, .keep_all = TRUE)

Tweets Description

# Show the last 20 rows of the tweet table
tail(tweets, n = 20)

Text Cleaning

library(tm)

## Loading required package: NLP

## 
## Attaching package: 'NLP'

## The following object is masked from 'package:ggplot2':
## 
##     annotate

Build corpus

## Warning in tm_map.SimpleCorpus(myCorpus, content_transformer(tolower)):
## transformation drops documents

## Warning in tm_map.SimpleCorpus(myCorpus, content_transformer(removeURL)):
## transformation drops documents

## Warning in tm_map.SimpleCorpus(myCorpus,
## content_transformer(removeNumPunct)): transformation drops documents

## Warning in tm_map.SimpleCorpus(myCorpus, removeWords, myStopwords):
## transformation drops documents

## Warning in tm_map.SimpleCorpus(myCorpus, stripWhitespace): transformation
## drops documents

Frequent Words

Build Term Document Matrix

# Term-document matrix of the cleaned corpus; wordLengths = c(1, Inf) keeps
# terms of every length, including one- and two-letter words
tdm1 <- TermDocumentMatrix(
  myCorpus,
  control = list(wordLengths = c(1, Inf))
)

tdm1

## <<TermDocumentMatrix (terms: 658, documents: 167)>>
## Non-/sparse entries: 3314/106572
## Sparsity           : 97%
## Maximal term length: 39
## Weighting          : term frequency (tf)

Top Frequent Terms

# Terms that occur at least 20 times across the corpus
freq.terms <- findFreqTerms(tdm1, lowfreq = 20)

# Show at most the first 50 terms. head() stops at the end of the vector,
# whereas freq.terms[1:50] pads the result with NA whenever fewer than 50
# terms reach the threshold.
head(freq.terms, 50)

##  [1] "airasia"      "blibli"       "bpjs"         "citilink"    
##  [5] "garuda"       "hemat"        "infojakarta"  "jakpost"     
##  [9] "kompastv"     "kppu"         "lazada"       "lionair"     
## [13] "metrotv"      "namair"       "okezone"      "pegipegi"    
## [17] "selebrita"    "shopee"       "sriwijayaair" "telkomsel"   
## [21] "trans"        "transtv"      "traveloka"    "tribunnews"  
## [25] "tvone"        "antv"         "grab"         "harga"       
## [29] "indosat"      "jujur"        "beritasatu"   "halobca"     
## [33] "konsumen"     NA             NA             NA            
## [37] NA             NA             NA             NA            
## [41] NA             NA             NA             NA            
## [45] NA             NA             NA             NA            
## [49] NA             NA

# Total occurrences of each term: row sums of the dense term-document matrix
term.freq <- rowSums(as.matrix(tdm1))
# Retain only the terms counted at least 150 times
term.freq <- term.freq[term.freq >= 150]
df <- data.frame(term = names(term.freq), freq = term.freq)

# Horizontal bar chart of the retained terms and their counts
ggplot(df, aes(x = term, y = freq)) +
  geom_col() +
  xlab("Terms") +
  ylab("Count") +
  coord_flip() +
  theme(axis.text = element_text(size = 7))

Wordcloud

Build Wordcloud

## Loading required package: RColorBrewer

# Word cloud of the term frequencies with a requested cut-off of 100.
# random.order = FALSE plots words in decreasing order of frequency.
# FALSE is spelled out because the shorthand F is an ordinary variable that
# can be reassigned elsewhere in the session.
wordcloud(words = names(word.freq), freq = word.freq, min.freq = 100,
    random.order = FALSE, colors = pal1)

INDOSAT

Extracting Tweets

Retrieve tweets from Twitter

## <Token>
## <oauth_endpoint>
##  request:   https://api.twitter.com/oauth/request_token
##  authorize: https://api.twitter.com/oauth/authenticate
##  access:    https://api.twitter.com/oauth/access_token
## <oauth_app> my_twitter_research_app
##   key:    sxxdmMv0ceEXTFN0ZlqsdTcdu
##   secret: <hidden>
## <credentials> oauth_token, oauth_token_secret
## ---

# Query the Twitter search API for statuses tagged #Indosat (n = 8000 requested)
tweets <- search_tweets(
  q = "#Indosat",
  n = 8000,
  tweet_mode = "extended"
)

## Searching for tweets...

## Finished collecting tweets!

# Drop rows whose tweet text repeats an earlier row; keep every column
tweets <- distinct(.data = tweets, text, .keep_all = TRUE)

Tweets Description

# Time series of tweet counts, aggregated into three-hour bins.
# The title names the #Indosat query that produced `tweets`; it previously
# said "indihome ... from past 3 hours", which matched neither the search
# term nor the time span (three hours is the bin width, not the window).
ts_plot(tweets, "3 hours") +
  theme_minimal() +
  theme(plot.title = ggplot2::element_text(face = "bold")) +
  labs(
    x = NULL, y = NULL,
    title = "Frequency of #Indosat Twitter statuses",
    subtitle = "Twitter status (tweet) counts aggregated using three-hour intervals",
    caption = "\nSource: Data collected from Twitter's REST API via rtweet"
  )

# Show the last 20 rows of the tweet table
tail(tweets, n = 20)

Text Cleaning

library(tm)

Build corpus

# Wrap the tweet texts (a character vector) in a tm corpus and fold every
# document to lower case in the same step
myCorpus <- tm_map(
  Corpus(VectorSource(tweets$text)),
  content_transformer(tolower)
)

## Warning in tm_map.SimpleCorpus(myCorpus, content_transformer(tolower)):
## transformation drops documents

# Strip URLs: delete "http" together with every non-whitespace character
# that directly follows it
removeURL <- function(x) {
  gsub(pattern = "http[^[:space:]]*", replacement = "", x = x)
}
# Apply removeURL to the text of every document in the corpus
myCorpus <- tm_map(x = myCorpus, FUN = content_transformer(removeURL))

## Warning in tm_map.SimpleCorpus(myCorpus, content_transformer(removeURL)):
## transformation drops documents

# Delete every character that is neither alphabetic ([:alpha:], as defined by
# the current locale) nor whitespace, i.e. digits, punctuation and symbols
removeNumPunct <- function(x) {
  gsub(pattern = "[^[:alpha:][:space:]]*", replacement = "", x = x)
}
# Apply removeNumPunct to the text of every document in the corpus
myCorpus <- tm_map(x = myCorpus, FUN = content_transformer(removeNumPunct))

## Warning in tm_map.SimpleCorpus(myCorpus,
## content_transformer(removeNumPunct)): transformation drops documents

# Assemble the stopword list and strip those words from the corpus.
# Indonesian stopwords are read from a one-column text file.
# NOTE(review): the path is an absolute Windows drive path — confirm it exists
# on the machine that runs this.
stopwords_id <- read.table("E://stopwords-id.txt", header = FALSE)
# Order: English stopwords except "r" and "big", extra English/Twitter
# tokens, the Indonesian list, then two short chat tokens.
# NOTE(review): "indihome" is removed although this section analyses
# #Indosat — confirm that is intended.
myStopwords <- c(
  setdiff(stopwords("english"), c("r", "big")),
  "use", "see", "used", "via", "amp", "indihome",
  as.matrix(stopwords_id$V1),
  "hi", "yg"
)
myCorpus <- tm_map(myCorpus, removeWords, myStopwords)

## Warning in tm_map.SimpleCorpus(myCorpus, removeWords, myStopwords):
## transformation drops documents

# Apply tm's stripWhitespace to every document to remove the extra
# whitespace left behind by the earlier removals
myCorpus <- tm_map(x = myCorpus, FUN = stripWhitespace)

## Warning in tm_map.SimpleCorpus(myCorpus, stripWhitespace): transformation
## drops documents

# Keep the cleaned corpus under a second name, intended for stem completion.
# NOTE(review): no stemming or stem-completion step is visible in this part
# of the file — confirm the copy is still needed.
myCorpusCopy <- myCorpus

Frequent Words_2

Build Term Document Matrix_2

# Term-document matrix of the cleaned corpus; wordLengths = c(1, Inf) keeps
# terms of every length, including one- and two-letter words
tdm <- TermDocumentMatrix(
  myCorpus,
  control = list(wordLengths = c(1, Inf))
)

tdm

## <<TermDocumentMatrix (terms: 404, documents: 86)>>
## Non-/sparse entries: 1810/32934
## Sparsity           : 95%
## Maximal term length: 20
## Weighting          : term frequency (tf)

Top Frequent Terms_2

# Terms that occur at least 20 times across the corpus
freq.terms <- findFreqTerms(tdm, lowfreq = 20)

# Show at most the first 50 terms. head() stops at the end of the vector,
# whereas freq.terms[1:50] pads the result with NA whenever fewer than 50
# terms reach the threshold.
head(freq.terms, 50)

##  [1] "airasia"      "antv"         "blibli"       "citilink"    
##  [5] "garuda"       "grab"         "harga"        "hemat"       
##  [9] "indosat"      "infojakarta"  "jakpost"      "jujur"       
## [13] "lionair"      "metrotv"      "namair"       "okezone"     
## [17] "pegipegi"     "selebrita"    "shopee"       "sriwijayaair"
## [21] "telkomsel"    "trans"        "transtv"      "traveloka"   
## [25] "tvone"        "halobca"      "konsumen"     NA            
## [29] NA             NA             NA             NA            
## [33] NA             NA             NA             NA            
## [37] NA             NA             NA             NA            
## [41] NA             NA             NA             NA            
## [45] NA             NA             NA             NA            
## [49] NA             NA

# Total occurrences of each term: row sums of the dense term-document matrix
term.freq <- rowSums(as.matrix(tdm))
# Keep terms counted at least 20 times, the same cut-off passed to
# findFreqTerms() for this corpus. The earlier cut-off of 150 appears to be
# above every term's frequency in this 86-document corpus (the word cloud's
# min.freq = 100 already exceeded the maximum), which left nothing to plot.
term.freq <- subset(term.freq, term.freq >= 20)
df <- data.frame(term = names(term.freq), freq = term.freq)

# Horizontal bar chart of the retained terms and their counts
ggplot(df, aes(x = term, y = freq)) +
  geom_col() +
  xlab("Terms") + ylab("Count") + coord_flip() +
  theme(axis.text = element_text(size = 7))

Wordcloud_2

Build Wordcloud_2

library(wordcloud)

# Dense copy of the term-document matrix (terms in rows, documents in columns)
m <- as.matrix(tdm)
# Total count of each term, most frequent first
word.freq <- sort(rowSums(m), decreasing = TRUE)
# Colours: the 9-step "BuGn" Brewer palette without its four lightest shades
pal <- brewer.pal(9, "BuGn")[-(1:4)]

# min.freq = 20 matches the cut-off passed to findFreqTerms() for this corpus.
# The earlier value of 100 was above the largest term frequency; wordcloud()
# then ignores the cut-off and tries to place every term, which is consistent
# with the long run of "could not be fit on page" warnings this call produced.
# TRUE/FALSE are spelled out because T and F are reassignable variables.
wordcloud(words = names(word.freq), freq = word.freq, min.freq = 20,
    random.order = FALSE, colors = pal)

## Warning in wordcloud(words = names(word.freq), freq = word.freq, min.freq =
## 100, : rhenald could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = names(word.freq), freq = word.freq, min.freq =
## 100, : robohin could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = names(word.freq), freq = word.freq, min.freq =
## 100, : sinyalindosat could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = names(word.freq), freq = word.freq, min.freq =
## 100, : towernya could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = names(word.freq), freq = word.freq, min.freq =
## 100, : operator could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = names(word.freq), freq = word.freq, min.freq =
## 100, : pulsailang could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = names(word.freq), freq = word.freq, min.freq =
## 100, : pulsanya could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = names(word.freq), freq = word.freq, min.freq =
## 100, : serasa could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = names(word.freq), freq = word.freq, min.freq =
## 100, : unlimited could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = names(word.freq), freq = word.freq, min.freq =
## 100, : butut could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = names(word.freq), freq = word.freq, min.freq =
## 100, : maneh could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = names(word.freq), freq = word.freq, min.freq =
## 100, : barusan could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = names(word.freq), freq = word.freq, min.freq
## = 100, : jaringantanpakonplen could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(words = names(word.freq), freq = word.freq, min.freq =
## 100, : kebelakang could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = names(word.freq), freq = word.freq, min.freq =
## 100, : mbps could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = names(word.freq), freq = word.freq, min.freq =
## 100, : stagnan could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = names(word.freq), freq = word.freq, min.freq =
## 100, : tembus could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = names(word.freq), freq = word.freq, min.freq =
## 100, : hmmmm could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = names(word.freq), freq = word.freq, min.freq =
## 100, : komplen could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = names(word.freq), freq = word.freq, min.freq =
## 100, : paketan could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = names(word.freq), freq = word.freq, min.freq =
## 100, : tampa could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = names(word.freq), freq = word.freq, min.freq =
## 100, : kena could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = names(word.freq), freq = word.freq, min.freq =
## 100, : ketahuan could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = names(word.freq), freq = word.freq, min.freq =
## 100, : menghentikan could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = names(word.freq), freq = word.freq, min.freq =
## 100, : sedot could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = names(word.freq), freq = word.freq, min.freq =
## 100, : bbrp could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = names(word.freq), freq = word.freq, min.freq =
## 100, : dibeli could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = names(word.freq), freq = word.freq, min.freq =
## 100, : internasional could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = names(word.freq), freq = word.freq, min.freq =
## 100, : kesel could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = names(word.freq), freq = word.freq, min.freq =
## 100, : akses could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = names(word.freq), freq = word.freq, min.freq =
## 100, : internetjuaraaaa could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = names(word.freq), freq = word.freq, min.freq =
## 100, : hujan could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = names(word.freq), freq = word.freq, min.freq =
## 100, : lemottttttt could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = names(word.freq), freq = word.freq, min.freq =
## 100, : musim could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = names(word.freq), freq = word.freq, min.freq =
## 100, : player could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = names(word.freq), freq = word.freq, min.freq =
## 100, : bekasi could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = names(word.freq), freq = word.freq, min.freq =
## 100, : jaya could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = names(word.freq), freq = word.freq, min.freq =
## 100, : screenshot could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = names(word.freq), freq = word.freq, min.freq =
## 100, : speedtest could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = names(word.freq), freq = word.freq, min.freq =
## 100, : wisma could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = names(word.freq), freq = word.freq, min.freq =
## 100, : limit could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = names(word.freq), freq = word.freq, min.freq =
## 100, : malam could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = names(word.freq), freq = word.freq, min.freq =
## 100, : pulatp could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = names(word.freq), freq = word.freq, min.freq =
## 100, : benarbenar could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = names(word.freq), freq = word.freq, min.freq =
## 100, : indikasi could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = names(word.freq), freq = word.freq, min.freq =
## 100, : kamar could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = names(word.freq), freq = word.freq, min.freq =
## 100, : keseringan could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = names(word.freq), freq = word.freq, min.freq =
## 100, : mandi could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = names(word.freq), freq = word.freq, min.freq =
## 100, : membawa could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = names(word.freq), freq = word.freq, min.freq =
## 100, : memegang could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = names(word.freq), freq = word.freq, min.freq =
## 100, : myim could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = names(word.freq), freq = word.freq, min.freq =
## 100, : perasaan could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = names(word.freq), freq = word.freq, min.freq =
## 100, : sahabat could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = names(word.freq), freq = word.freq, min.freq =
## 100, : tahukah could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = names(word.freq), freq = word.freq, min.freq =
## 100, : terkena could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = names(word.freq), freq = word.freq, min.freq =
## 100, : jaringanya could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = names(word.freq), freq = word.freq, min.freq =
## 100, : ngaruh could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = names(word.freq), freq = word.freq, min.freq =
## 100, : error could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = names(word.freq), freq = word.freq, min.freq =
## 100, : jelasin could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = names(word.freq), freq = word.freq, min.freq =
## 100, : jelek could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = names(word.freq), freq = word.freq, min.freq =
## 100, : beratus could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = names(word.freq), freq = word.freq, min.freq =
## 100, : darah could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = names(word.freq), freq = word.freq, min.freq =
## 100, : dibikin could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = names(word.freq), freq = word.freq, min.freq =
## 100, : dinotif could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = names(word.freq), freq = word.freq, min.freq =
## 100, : hentikan could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = names(word.freq), freq = word.freq, min.freq =
## 100, : indosatooredo could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = names(word.freq), freq = word.freq, min.freq =
## 100, : initpstcoiqbrhoypn could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = names(word.freq), freq = word.freq, min.freq =
## 100, : notif could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = names(word.freq), freq = word.freq, min.freq =
## 100, : pesan could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = names(word.freq), freq = word.freq, min.freq =
## 100, : pop could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = names(word.freq), freq = word.freq, min.freq =
## 100, : smpe could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = names(word.freq), freq = word.freq, min.freq =
## 100, : bebas could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = names(word.freq), freq = word.freq, min.freq =
## 100, : hambatan could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = names(word.freq), freq = word.freq, min.freq =
## 100, : jelang could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = names(word.freq), freq = word.freq, min.freq =
## 100, : koq could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = names(word.freq), freq = word.freq, min.freq =
## 100, : subuh could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = names(word.freq), freq = word.freq, min.freq =
## 100, : yabukannya could not be fit on page. It will not be plotted.

## Warning in wordcloud(words = names(word.freq), freq = word.freq, min.freq =
## 100, : yasinyaljelek could not be fit on page. It will not be plotted.

Text Mining - by Rosikhu Ilmi

TELKOMSEL

Extracting Tweets

Retrieve tweets from Twitter

Tweets Description

Text Cleaning

Build corpus

Frequent Words

Build Term Document Matrix

Top Frequent Terms

Wordcloud

Build Wordcloud

INDOSAT

Extracting Tweets

Retrieve tweets from Twitter

Tweets Description

Text Cleaning

Build corpus

Frequent Words_2

Build Term Document Matrix_2

Top Frequent Terms_2

Wordcloud_2

Build Wordcloud_2