This project analyzes tweets about two Indonesian television channels, Indosiar and TVOne, and reports the Twitter handles involved in the conversation, the hashtags used, and the words that occurred.

Get the Tweets

This is achieved using the rtweet package, which offers a number of functions to access the Twitter API. Here, the search_tweets function is used. In this case, a search for 5000 tweets is carried out for each of the patterns ‘Indosiar’ and ‘TVOne’. The tweets returned are stored in the objects ‘tweets1’ and ‘tweets2’, and tweets with duplicated text are then removed.

# Search Twitter for 5000 tweets matching "indosiar"
tweets1 <- search_tweets(
  q = "indosiar",
  n = 5000,
  tweet_mode = "extended"
)

## Searching for tweets...

## Finished collecting tweets!

# Keep one row per distinct tweet text, retaining every other column
tweets1 <- distinct(tweets1, text, .keep_all = TRUE)
# Search Twitter for 5000 tweets matching "tvone".
# retryonratelimit = TRUE makes rtweet wait for the API rate limit to reset
# and then continue collecting, instead of stopping early. The run recorded
# below was made without it and reported "Rate limit exceeded".
tweets2 <- search_tweets(
  "tvone",
  n = 5000,
  retryonratelimit = TRUE,
  tweet_mode = "extended"
)

## Searching for tweets...

## Warning: Rate limit exceeded - 88

## Warning: Rate limit exceeded

## Finished collecting tweets!

# Keep one row per distinct tweet text, retaining every other column
tweets2 <- distinct(tweets2, text, .keep_all = TRUE)

Tweets Description - Time Series

## Time series of Indosiar tweet counts, binned into three-hour intervals
ts_plot(data = tweets1, by = "3 hours") +
  labs(
    title = "Frequency of Indosiar Twitter statuses from past 9 days",
    subtitle = "Twitter status (tweet) counts aggregated using three-hour intervals",
    caption = "\nSource: Data collected from Twitter's REST API via rtweet",
    x = NULL,
    y = NULL
  ) +
  # theme_minimal() must come before theme() so the bold title is not reset
  theme_minimal() +
  theme(plot.title = ggplot2::element_text(face = "bold"))

In the time series chart above, Indosiar tweets show some patterns. Over the past 9 days, November 9 has the highest tweet count, which suggests that on that day Indosiar was showing the live Liga 1 broadcast of Persebaya vs PSM.

## Time series of TVOne tweet counts, binned into three-hour intervals
ts_plot(data = tweets2, by = "3 hours") +
  labs(
    title = "Frequency of TVOne Twitter statuses from past 5 days",
    subtitle = "Twitter status (tweet) counts aggregated using three-hour intervals",
    caption = "\nSource: Data collected from Twitter's REST API via rtweet",
    x = NULL,
    y = NULL
  ) +
  # theme_minimal() must come before theme() so the bold title is not reset
  theme_minimal() +
  theme(plot.title = ggplot2::element_text(face = "bold"))

In the time series chart above, TVOne tweets show no clear pattern. Over the past 5 days, November 9 has the highest tweet count, which suggests that on that day TVOne was also showing the live Liga 1 broadcast of Persebaya vs PSM.

# Preview the first 20 rows of the Indosiar tweets
head(tweets1, n = 20)

# Preview the first 20 rows of the TVOne tweets
head(tweets2, n = 20)

Text Cleaning

library(tm)

Build corpus

# Indosiar: build a corpus from the tweet text vector, then lower-case it
myCorpus1 <- Corpus(VectorSource(tweets1[["text"]]))
myCorpus1 <- tm_map(myCorpus1, FUN = content_transformer(tolower))
# TVOne: same two steps
myCorpus2 <- Corpus(VectorSource(tweets2[["text"]]))
myCorpus2 <- tm_map(myCorpus2, FUN = content_transformer(tolower))
# Strip URLs: delete "http" together with every non-whitespace character
# that follows it. Works element-wise on a character vector.
removeURL <- function(x) {
  gsub(pattern = "http[^[:space:]]*", replacement = "", x = x)
}
# Run the URL stripper over every document in both corpora
myCorpus1 <- tm_map(
  myCorpus1,
  FUN = content_transformer(removeURL)
)
myCorpus2 <- tm_map(
  myCorpus2,
  FUN = content_transformer(removeURL)
)
# Remove every character that is neither alphabetic nor whitespace (digits,
# punctuation, symbols). Note that [:alpha:] follows the current locale, so
# it is not limited to English letters. Removed characters are deleted, not
# replaced by a space, so neighbouring words are joined.
# The quantifier is `+` rather than `*`: `*` also matches the empty string at
# every position, which adds pointless zero-length substitutions. The result
# is the same.
removeNumPunct <- function(x) {
  gsub("[^[:alpha:][:space:]]+", "", x)
}
# Run the non-letter stripper over every document in both corpora
myCorpus1 <- tm_map(
  myCorpus1,
  FUN = content_transformer(removeNumPunct)
)
myCorpus2 <- tm_map(
  myCorpus2,
  FUN = content_transformer(removeNumPunct)
)
# Stopword list, part 1: tm's English stopwords (minus "r" and "big" if
# present) plus a few extra English words
myStopwords <- c(
  setdiff(stopwords("english"), c("r", "big")),
  "use", "see", "used", "via"
)
# Part 2: first column of the local file stopwords-id.txt, plus slang terms
# and the channel names used as search terms
stopword_id <- read.table("stopwords-id.txt", header = FALSE)
myStopwords <- c(
  myStopwords,
  as.matrix(stopword_id$V1),
  "hi", "yg", "indosiar", "tvone", "tvonenews"
)
# Delete every stopword from both corpora
myCorpus1 <- tm_map(myCorpus1, FUN = removeWords, myStopwords)
myCorpus2 <- tm_map(myCorpus2, FUN = removeWords, myStopwords)
# Collapse the extra whitespace left behind by the removals above
myCorpus1 <- tm_map(myCorpus1, FUN = stripWhitespace)
myCorpus2 <- tm_map(myCorpus2, FUN = stripWhitespace)
# Keep a copy of each cleaned corpus (the original note says this is for
# stem completion later)
myCorpusCopy1 <- myCorpus1
myCorpusCopy2 <- myCorpus2

Frequent Words

Build Term Document Matrix

# Term-document matrices for both corpora. wordLengths = c(1, Inf) keeps
# terms of every length, including single-character ones.
tdm_control <- list(wordLengths = c(1, Inf))
tdm1 <- TermDocumentMatrix(myCorpus1, control = tdm_control)
tdm2 <- TermDocumentMatrix(myCorpus2, control = tdm_control)
rm(tdm_control)

# Display the summary of the Indosiar term-document matrix
tdm1

## <<TermDocumentMatrix (terms: 7798, documents: 2396)>>
## Non-/sparse entries: 25533/18658475
## Sparsity           : 100%
## Maximal term length: 126
## Weighting          : term frequency (tf)

# Display the summary of the TVOne term-document matrix
tdm2

## <<TermDocumentMatrix (terms: 6239, documents: 1652)>>
## Non-/sparse entries: 21326/10285502
## Sparsity           : 100%
## Maximal term length: 39
## Weighting          : term frequency (tf)

Top Frequent Terms

# Terms that occur at least 20 times in each term-document matrix
freq.terms1 <- findFreqTerms(tdm1, lowfreq = 20)
freq.terms2 <- findFreqTerms(tdm2, lowfreq = 20)

# Show at most the first 50 Indosiar terms. head() stops at the end of the
# vector, whereas [1:50] pads the result with NA when fewer than 50 terms
# reach the threshold.
head(freq.terms1, 50)

##  [1] "kaya"         "sinetron"     "ya"           "mnc"         
##  [5] "ntar"         "jumat"        "ko"           "live"        
##  [9] "makassar"     "persija"      "pertandingan" "psm"         
## [13] "vs"           "wib"          "indosiarid"   "jakarta"     
## [17] "kali"         "liga"         "persebaya"    "saksikan"    
## [21] "surabaya"     "tv"           "vidio"        "kick"        
## [25] "match"        "minggu"       "november"     "stadion"     
## [29] "ps"           "sabtu"        "tira"         "azab"        
## [33] "nya"          "udah"         "film"         "dangdut"     
## [37] "main"         "cinta"        "hidup"        "kisah"       
## [41] "nonton"       "jaman"        "koreanthingy" "naga"        
## [45] "nih"          "pas"          "gojekliga"    "mania"       
## [49] "persib"       "persibday"

# Show at most the first 50 TVOne terms. head() stops at the end of the
# vector, whereas [1:50] pads the result with NA when fewer than 50 terms
# reach the threshold.
head(freq.terms2, 50)

##  [1] "buy"           "cool"          "cube"          "ice"          
##  [5] "j"             "ll"            "networks"      "teaming"      
##  [9] "television"    "rockygerung"   "akimalamtvone" "cerdas"       
## [13] "genderuwo"     "jokowi"        "mencerdaskan"  "politik"      
## [17] "sontoloyo"     "sy"            "tdk"           "rocky"        
## [21] "wib"           "berita"        "gak"           "lihat"        
## [25] "one"           "rakyat"        "tv"            "amp"          
## [29] "android"       "connect"       "ios"           "jam"          
## [33] "kabar"         "live"          "saksikan"      "selengkapnya" 
## [37] "streaming"     "harian"        "update"        "fakta"        
## [41] "new"           "november"      "sabtu"         "vs"           
## [45] "world"         "tonton"        "menarik"       "ya"           
## [49] "download"      "program"

# Indosiar: total occurrences per term, keeping terms seen at least 70 times,
# stored as a term/freq data frame for plotting
term.freq1 <- rowSums(as.matrix(tdm1))
term.freq1 <- term.freq1[term.freq1 >= 70]
df1 <- data.frame(term = names(term.freq1), freq = term.freq1)

# TVOne: same steps
term.freq2 <- rowSums(as.matrix(tdm2))
term.freq2 <- term.freq2[term.freq2 >= 70]
df2 <- data.frame(term = names(term.freq2), freq = term.freq2)

# Horizontal bar chart of the frequent Indosiar terms, one colour per term,
# with the legend hidden
ggplot(data = df1, mapping = aes(x = term, y = freq)) +
  geom_col(mapping = aes(fill = term)) +
  coord_flip() +
  labs(x = "Terms", y = "Count") +
  theme(
    legend.position = "none",
    axis.text = element_text(size = 7)
  )

The chart above shows the words most frequently tweeted by Twitter users. The word with the highest count is ‘azab’, which reflects an inside joke among Indonesian Twitter users based on Indosiar's most phenomenal TV show.

# Horizontal bar chart of the frequent TVOne terms, one colour per term,
# with the legend hidden
ggplot(data = df2, mapping = aes(x = term, y = freq)) +
  geom_col(mapping = aes(fill = term)) +
  coord_flip() +
  labs(x = "Terms", y = "Count") +
  theme(
    legend.position = "none",
    axis.text = element_text(size = 7)
  )

The chart above shows the words most frequently tweeted by Twitter users. The word with the highest count is ‘akimalamtvone’, which is short for Apa Kabar Indonesia Malam TV One, the biggest hit show on TV One.

Wordcloud

Build Wordcloud

library(wordcloud)

# Dense term-by-document count matrices
m1 <- as.matrix(tdm1)
m2 <- as.matrix(tdm2)
# Total count of each term, sorted from most to least frequent.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
word.freq1 <- sort(rowSums(m1), decreasing = TRUE)
word.freq2 <- sort(rowSums(m2), decreasing = TRUE)
# Colours: the 9-step "BuGn" palette with its first four entries dropped
pal1 <- brewer.pal(9, "BuGn")[-(1:4)]
pal2 <- brewer.pal(9, "BuGn")[-(1:4)]

# Indosiar wordcloud: only terms with a count of at least 20 are drawn.
# FALSE is spelled out because F is an ordinary variable that can be
# reassigned.
wordcloud(
  words = names(word.freq1),
  freq = word.freq1,
  min.freq = 20,
  random.order = FALSE,
  colors = pal1
)

The wordcloud for Indosiar is mostly filled with entertainment terms such as Azab, Sinetron, Nonton, FTV, and many more.

# TVOne wordcloud: only terms with a count of at least 20 are drawn.
# FALSE is spelled out because F is an ordinary variable that can be
# reassigned.
wordcloud(
  words = names(word.freq2),
  freq = word.freq2,
  min.freq = 20,
  random.order = FALSE,
  colors = pal2
)

On the other hand, TV One’s wordcloud is filled with the names of politicians such as Fadli Zon, Jokowi, Prabowo, and many more.

Indonesia Television Channel - Twitter Analysis

Nur Fidyah Permatasari (06211540000087)

November 12, 2018

Extracting Tweets

Package installation