This project analyzes tweets about two Indonesian television channels, Indosiar and TVOne, and reports the Twitter handles involved in the conversation, the hashtags used, and the words that occur most often.
It is fairly easy to do a Twitter analysis in R. We first need a few packages to kick things off.
# Load packages
library(rtweet)
library(tidyverse)
# Twitter authentication
# Registers the OAuth credentials of the Twitter app with rtweet.
# The four credential variables must already exist in the session; they are
# not defined in the visible part of this script.
create_token(
  app = "my_twitter_research_app",
  consumer_key = consumer_key,
  consumer_secret = consumer_secret,
  access_token = access_token,
  access_secret = access_secret
)
## <Token>
## <oauth_endpoint>
## request: https://api.twitter.com/oauth/request_token
## authorize: https://api.twitter.com/oauth/authenticate
## access: https://api.twitter.com/oauth/access_token
## <oauth_app> my_twitter_research_app
## key: koZFJbYVk7gagYEGLaUN6RmTb
## secret: <hidden>
## <credentials> oauth_token, oauth_token_secret
## ---
Tweets are retrieved using the rtweet package, which offers a number of functions for accessing the Twitter API. Here, the search_tweets function is used: one search for up to 5000 tweets containing the pattern ‘Indosiar’ and another for ‘TVOne’ are carried out. The values returned are two data frames of tweets, ‘tweets1’ and ‘tweets2’, which are then de-duplicated by tweet text.
# Retrieve tweets
# Ask for up to 5000 tweets per channel keyword, then keep a single row for
# each distinct tweet text (all other columns are retained).
# NOTE(review): the recorded output below shows a rate-limit warning for the
# "tvone" query, so tweets2 may hold fewer tweets than requested; consider
# whether search_tweets(retryonratelimit = TRUE) is wanted here.
tweets1 <- search_tweets(q = "indosiar", n = 5000, tweet_mode = "extended")
## Searching for tweets...
## Finished collecting tweets!
tweets1 <- tweets1 %>% distinct(text, .keep_all = TRUE)
tweets2 <- search_tweets(q = "tvone", n = 5000, tweet_mode = "extended")
## Searching for tweets...
## Warning: Rate limit exceeded - 88
## Warning: Rate limit exceeded
## Finished collecting tweets!
tweets2 <- tweets2 %>% distinct(text, .keep_all = TRUE)
## plot time series of tweets
# Indosiar: tweet counts aggregated into three-hour bins.
ts_plot(tweets1, by = "3 hours") +
  labs(
    title = "Frequency of Indosiar Twitter statuses from past 9 days",
    subtitle = "Twitter status (tweet) counts aggregated using three-hour intervals",
    caption = "\nSource: Data collected from Twitter's REST API via rtweet",
    x = NULL,
    y = NULL
  ) +
  theme_minimal() +
  theme(plot.title = ggplot2::element_text(face = "bold"))
The time series chart above shows some patterns in the Indosiar tweets. Over the past 9 days, November 9 has the highest tweet count. This indicates that on that day Indosiar was showing a live broadcast of the Liga 1 match Persebaya vs PSM.
# TVOne: tweet counts aggregated into three-hour bins.
ts_plot(tweets2, by = "3 hours") +
  labs(
    title = "Frequency of TVOne Twitter statuses from past 5 days",
    subtitle = "Twitter status (tweet) counts aggregated using three-hour intervals",
    caption = "\nSource: Data collected from Twitter's REST API via rtweet",
    x = NULL,
    y = NULL
  ) +
  theme_minimal() +
  theme(plot.title = ggplot2::element_text(face = "bold"))
The time series chart above shows no clear pattern in the TVOne tweets. Over the past 5 days, November 9 has the highest tweet count. This indicates that on that day TVOne was also showing a live broadcast of the Liga 1 match Persebaya vs PSM.
# Preview the first 20 rows of each de-duplicated tweet table.
head(tweets1, n = 20)
head(tweets2, n = 20)
library(tm)
# Build one corpus per channel from the tweet texts (character vectors),
# then convert every document to lower case.
myCorpus1 <- tweets1$text %>%
  VectorSource() %>%
  Corpus()
myCorpus2 <- tweets2$text %>%
  VectorSource() %>%
  Corpus()
# content_transformer() wraps the plain tolower() function for use in tm_map()
myCorpus1 <- myCorpus1 %>% tm_map(content_transformer(tolower))
myCorpus2 <- myCorpus2 %>% tm_map(content_transformer(tolower))
# remove URLs
# Deletes every run that starts with "http" and continues up to (but not
# including) the next whitespace character or the end of the string.
# `x` is a character vector; a character vector of the same length is returned.
removeURL <- function(x) {
  gsub(pattern = "http[^[:space:]]*", replacement = "", x = x)
}
# Strip URLs from every document of both corpora.
myCorpus1 <- myCorpus1 %>% tm_map(content_transformer(removeURL))
myCorpus2 <- myCorpus2 %>% tm_map(content_transformer(removeURL))
# remove anything other than letters or whitespace
# Matching characters (digits, punctuation, symbols) are deleted outright, not
# replaced by a space. [:alpha:] is locale-dependent, so it is not limited to
# English letters.
# `x` is a character vector; a character vector of the same length is returned.
removeNumPunct <- function(x) {
  gsub(pattern = "[^[:alpha:][:space:]]*", replacement = "", x = x)
}
# Drop digits and punctuation from every document of both corpora.
myCorpus1 <- myCorpus1 %>% tm_map(content_transformer(removeNumPunct))
myCorpus2 <- myCorpus2 %>% tm_map(content_transformer(removeNumPunct))
# remove stopwords
# English part: tm's English list without "r" and "big", plus a few extra
# filler words.
myStopwords <- c(
  setdiff(stopwords("english"), c("r", "big")),
  "use", "see", "used", "via"
)
# Indonesian part: read from a local file; only its first column (V1) is used.
stopword_id <- read.table("stopwords-id.txt", header = FALSE)
# Also drop some chat abbreviations and the channel names themselves, which
# were the search keywords.
myStopwords <- c(
  myStopwords,
  as.matrix(stopword_id$V1),
  "hi", "yg", "indosiar", "tvone", "tvonenews"
)
myCorpus1 <- myCorpus1 %>% tm_map(removeWords, myStopwords)
myCorpus2 <- myCorpus2 %>% tm_map(removeWords, myStopwords)
# remove extra whitespace
myCorpus1 <- myCorpus1 %>% tm_map(stripWhitespace)
myCorpus2 <- myCorpus2 %>% tm_map(stripWhitespace)
# keep a copy for stem completion later
# NOTE(review): no stemming step is visible in this part of the script;
# confirm these copies are still needed.
myCorpusCopy1 <- myCorpus1
myCorpusCopy2 <- myCorpus2
# Term-document matrices for both corpora. wordLengths = c(1, Inf) keeps terms
# of any length, including one- and two-letter terms.
tdm1 <- myCorpus1 %>%
  TermDocumentMatrix(control = list(wordLengths = c(1, Inf)))
tdm2 <- myCorpus2 %>%
  TermDocumentMatrix(control = list(wordLengths = c(1, Inf)))
tdm1
## <<TermDocumentMatrix (terms: 7798, documents: 2396)>>
## Non-/sparse entries: 25533/18658475
## Sparsity : 100%
## Maximal term length: 126
## Weighting : term frequency (tf)
tdm2
## <<TermDocumentMatrix (terms: 6239, documents: 1652)>>
## Non-/sparse entries: 21326/10285502
## Sparsity : 100%
## Maximal term length: 39
## Weighting : term frequency (tf)
# Terms that occur at least 20 times in each term-document matrix.
freq.terms1 <- findFreqTerms(tdm1, lowfreq = 20)
freq.terms2 <- findFreqTerms(tdm2, lowfreq = 20)
# Show at most the first 50 terms. Unlike freq.terms1[1:50], head() does not
# pad the result with NA when fewer than 50 terms qualify.
head(freq.terms1, 50)
## [1] "kaya" "sinetron" "ya" "mnc"
## [5] "ntar" "jumat" "ko" "live"
## [9] "makassar" "persija" "pertandingan" "psm"
## [13] "vs" "wib" "indosiarid" "jakarta"
## [17] "kali" "liga" "persebaya" "saksikan"
## [21] "surabaya" "tv" "vidio" "kick"
## [25] "match" "minggu" "november" "stadion"
## [29] "ps" "sabtu" "tira" "azab"
## [33] "nya" "udah" "film" "dangdut"
## [37] "main" "cinta" "hidup" "kisah"
## [41] "nonton" "jaman" "koreanthingy" "naga"
## [45] "nih" "pas" "gojekliga" "mania"
## [49] "persib" "persibday"
# Show at most the first 50 frequent TVOne terms. Unlike freq.terms2[1:50],
# head() does not pad the result with NA when fewer than 50 terms qualify.
head(freq.terms2, 50)
## [1] "buy" "cool" "cube" "ice"
## [5] "j" "ll" "networks" "teaming"
## [9] "television" "rockygerung" "akimalamtvone" "cerdas"
## [13] "genderuwo" "jokowi" "mencerdaskan" "politik"
## [17] "sontoloyo" "sy" "tdk" "rocky"
## [21] "wib" "berita" "gak" "lihat"
## [25] "one" "rakyat" "tv" "amp"
## [29] "android" "connect" "ios" "jam"
## [33] "kabar" "live" "saksikan" "selengkapnya"
## [37] "streaming" "harian" "update" "fakta"
## [41] "new" "november" "sabtu" "vs"
## [45] "world" "tonton" "menarik" "ya"
## [49] "download" "program"
# Total count of every term across all documents, restricted to terms that
# occur at least 70 times, then reshaped into a data frame for plotting.
term.freq1 <- tdm1 %>%
  as.matrix() %>%
  rowSums()
term.freq1 <- subset(term.freq1, term.freq1 >= 70)
df1 <- data.frame(term = names(term.freq1), freq = term.freq1)

term.freq2 <- tdm2 %>%
  as.matrix() %>%
  rowSums()
term.freq2 <- subset(term.freq2, term.freq2 >= 70)
df2 <- data.frame(term = names(term.freq2), freq = term.freq2)
# Horizontal bar chart of the frequent Indosiar terms: one bar per term,
# bar length = count. The fill legend is hidden because it would only repeat
# the axis labels.
ggplot(df1, aes(x = term, y = freq)) +
  geom_col(aes(fill = term)) +
  coord_flip() +
  labs(x = "Terms", y = "Count") +
  theme(
    legend.position = "none",
    axis.text = element_text(size = 7)
  )
The chart above shows the words most frequently tweeted by Twitter users. The word with the highest count is ‘azab’, which refers to an inside joke among Indonesian Twitter users based on Indosiar's most phenomenal TV show.
# Horizontal bar chart of the frequent TVOne terms: one bar per term,
# bar length = count. The fill legend is hidden because it would only repeat
# the axis labels.
ggplot(df2, aes(x = term, y = freq)) +
  geom_col(aes(fill = term)) +
  coord_flip() +
  labs(x = "Terms", y = "Count") +
  theme(
    legend.position = "none",
    axis.text = element_text(size = 7)
  )
The chart above shows the words most frequently tweeted by Twitter users. The word with the highest count is ‘akimalamtvone’, which is short for Apa Kabar Indonesia Malam TV One, the most popular show on TV One.
library(wordcloud)
# Dense matrix form of each term-document matrix (terms in rows).
m1 <- as.matrix(tdm1)
m2 <- as.matrix(tdm2)
# calculate the frequency of words and sort it by frequency
# (TRUE is spelled out: the shorthand T is an ordinary variable that can be
# reassigned)
word.freq1 <- sort(rowSums(m1), decreasing = TRUE)
word.freq2 <- sort(rowSums(m2), decreasing = TRUE)
# colors: take the 9-colour "BuGn" palette and drop its first four entries
pal1 <- brewer.pal(9, "BuGn")[-(1:4)]
pal2 <- brewer.pal(9, "BuGn")[-(1:4)]
# Word cloud of Indosiar terms that occur at least 20 times.
# random.order = FALSE plots words in decreasing frequency; FALSE is spelled
# out because the shorthand F is an ordinary variable that can be reassigned.
wordcloud(
  words = names(word.freq1),
  freq = word.freq1,
  min.freq = 20,
  random.order = FALSE,
  colors = pal1
)
The word cloud for Indosiar is mostly filled with entertainment terms such as Azab, Sinetron, Nonton, FTV, and many more.
# Word cloud of TVOne terms that occur at least 20 times.
# random.order = FALSE plots words in decreasing frequency; FALSE is spelled
# out because the shorthand F is an ordinary variable that can be reassigned.
wordcloud(
  words = names(word.freq2),
  freq = word.freq2,
  min.freq = 20,
  random.order = FALSE,
  colors = pal2
)
On the other hand, TV One’s word cloud is filled with politicians' names such as Fadli Zon, Jokowi, Prabowo, and many more.