Text mining ini dilakukan dengan membandingkan hasil visualisasi data teks (tweet) mengenai provider “TELKOMSEL” dan “INDOSAT”. Hasil visualisasinya ditampilkan sebagai berikut.
# Load packages
library(rtweet)
library(tidyverse)
# Access token and APIs
# The credentials are read from environment variables (for example set in
# ~/.Renviron) instead of being written into the script, so that secrets do
# not end up in version control or in a rendered report. Any key that was
# ever hard-coded in this file should be treated as leaked and regenerated.
consumer_key <- Sys.getenv("TWITTER_CONSUMER_KEY")
consumer_secret <- Sys.getenv("TWITTER_CONSUMER_SECRET")
access_token <- Sys.getenv("TWITTER_ACCESS_TOKEN")
access_secret <- Sys.getenv("TWITTER_ACCESS_SECRET")
# Sys.getenv() returns "" for an unset variable; fail early with a clear
# message rather than sending empty credentials to the API.
if (!all(nzchar(c(consumer_key, consumer_secret, access_token, access_secret)))) {
  stop(
    "Twitter credentials are missing. Set TWITTER_CONSUMER_KEY, ",
    "TWITTER_CONSUMER_SECRET, TWITTER_ACCESS_TOKEN and TWITTER_ACCESS_SECRET.",
    call. = FALSE
  )
}
# Twitter authentication
create_token(
  app = "my_twitter_research_app",
  consumer_key = consumer_key,
  consumer_secret = consumer_secret,
  access_token = access_token,
  access_secret = access_secret
)
<Token>
<oauth_endpoint>
request: https://api.twitter.com/oauth/request_token
authorize: https://api.twitter.com/oauth/authenticate
access: https://api.twitter.com/oauth/access_token
<oauth_app> my_twitter_research_app
key: sxxdmMv0ceEXTFN0ZlqsdTcdu
secret: <hidden>
<credentials> oauth_token, oauth_token_secret
---
# Retrieve tweets
# Request up to 8000 tweets matching "Telkomsel". The search endpoint is rate
# limited; `retryonratelimit = TRUE` makes rtweet wait for the limit to reset
# and carry on, instead of returning an incomplete (or empty) result.
tweets1 <- search_tweets(
  "Telkomsel",
  n = 8000,
  retryonratelimit = TRUE,
  tweet_mode = "extended"
)
Searching for tweets...
Rate limit exceeded - 88Rate limit exceededFinished collecting tweets!
# Keep only the first tweet for every distinct text. When the search returned
# nothing, the result has no `text` column and distinct() cannot be applied,
# so that case is reported explicitly instead of relying on a dplyr warning.
if ("text" %in% names(tweets1)) {
  tweets1 <- distinct(tweets1, text, .keep_all = TRUE)
} else {
  warning(
    "`tweets1` has no `text` column; the Telkomsel search returned no tweets.",
    call. = FALSE
  )
}
Trying to compute distinct() for variables not found in the data:
- `text`
This is an error, but only a warning is raised for compatibility reasons.
The operation will return the input unchanged.
# Time series of the Telkomsel tweet counts, bucketed into 3-hour intervals.
ts_plot(tweets1, by = "3 hours") +
  labs(
    title = "Frequency of Telkomsel Twitter statuses from past 3 hours",
    subtitle = "Twitter status (tweet) counts aggregated using three-hour intervals",
    caption = "\nSource: Data collected from Twitter's REST API via rtweet",
    x = NULL,
    y = NULL
  ) +
  theme_minimal() +
  theme(plot.title = ggplot2::element_text(face = "bold"))
Error: no datetime (POSIXct) var found
# Show the last 20 collected tweets.
tail(tweets1, 20)
# Corpus(), VectorSource(), tm_map() and content_transformer() come from the
# tm package, which has to be attached before its first use here.
library(tm)
# build a corpus, and specify the source to be character vectors
myCorpus1 <- Corpus(VectorSource(tweets1$text))
# convert to lower case
myCorpus1 <- tm_map(myCorpus1, content_transformer(tolower))
transformation drops documents
# Strip hyperlinks: "http" followed by any run of non-whitespace characters
# is replaced by the empty string.
removeURL1 <- function(x) {
  gsub(pattern = "http[^[:space:]]*", replacement = "", x = x)
}
url_cleaner1 <- content_transformer(removeURL1)
myCorpus1 <- tm_map(myCorpus1, url_cleaner1)
transformation drops documents
# Delete every character that is neither alphabetic ([:alpha:]) nor
# whitespace ([:space:]), i.e. digits, punctuation and other symbols.
removeNumPunct1 <- function(x) {
  gsub(pattern = "[^[:alpha:][:space:]]*", replacement = "", x = x)
}
symbol_cleaner1 <- content_transformer(removeNumPunct1)
myCorpus1 <- tm_map(myCorpus1, symbol_cleaner1)
transformation drops documents
# remove stopwords
# English stopwords (keeping "r" and "big") plus a few extra words. The corpus
# has already been lower-cased and removeWords() matches case-sensitively, so
# the brand name must be listed in lower case to be removed at all.
myStopwords1 <- c(
  setdiff(stopwords("english"), c("r", "big")),
  "use", "see", "cepat", "via", "data", "telkomsel"
)
# Indonesian stopword list; only the first column (V1) of the file is used.
stopwords_id <- read.table(
  "E://stopwords-id.txt",
  header = FALSE,
  stringsAsFactors = FALSE
)
myStopwords1 <- c(myStopwords1, as.character(stopwords_id$V1), "hi", "yg")
myCorpus1 <- tm_map(myCorpus1, removeWords, myStopwords1)
transformation drops documents
# Apply tm's stripWhitespace transformation to every document of the corpus.
myCorpus1 <- tm_map(x = myCorpus1, FUN = stripWhitespace)
transformation drops documents
# Snapshot of the cleaned corpus, kept for stem completion later.
myCorpusCopy1 <- myCorpus1
# Term-document matrix; wordLengths = c(1, Inf) keeps terms of any length,
# including single-character ones.
tdm1 <- TermDocumentMatrix(
  x = myCorpus1,
  control = list(wordLengths = c(1, Inf))
)
print(tdm1)
<<TermDocumentMatrix (terms: 5310, documents: 3480)>>
Non-/sparse entries: 45549/18433251
Sparsity : 100%
Maximal term length: 41
Weighting : term frequency (tf)
# Terms that occur at least 20 times in the Telkomsel corpus.
freq.terms1 <- findFreqTerms(tdm1, lowfreq = 20)
# Show at most the first 100 of them. Unlike `[1:100]`, head() does not pad
# the result with NA when fewer than 100 terms qualify.
head(freq.terms1, 100)
# Total count per term, restricted to terms seen at least 150 times.
term.freq1 <- rowSums(as.matrix(tdm1))
term.freq1 <- subset(term.freq1, term.freq1 >= 150)
df1 <- data.frame(term1 = names(term.freq1), freq1 = term.freq1)
# Horizontal bar chart of the most frequent terms.
ggplot(df1, aes(x = term1, y = freq1)) +
  geom_bar(stat = "identity") +
  xlab("Terms") +
  ylab("Count") +
  coord_flip() +
  theme(axis.text = element_text(size = 7))
library(wordcloud)
# brewer.pal() comes from RColorBrewer; attach it explicitly, as the Indosat
# section of this script does.
library(RColorBrewer)
m1 <- as.matrix(tdm1)
# calculate the frequency of words and sort it by frequency
word.freq1 <- sort(rowSums(m1), decreasing = TRUE)
# colors: the 9-class "BuGn" palette without its four lightest shades
pal1 <- brewer.pal(9, "BuGn")[-(1:4)]
# Word cloud of terms occurring at least 100 times, most frequent first.
# TRUE/FALSE are spelled out because T and F are ordinary, reassignable
# variables.
TELKOMSEL <- wordcloud(
  words = names(word.freq1),
  freq = word.freq1,
  min.freq = 100,
  random.order = FALSE,
  colors = pal1
)
# Load packages
library(rtweet)
library(tidyverse)
# Access token and APIs
# The credentials are read from environment variables (for example set in
# ~/.Renviron) instead of being written into the script, so that secrets do
# not end up in version control or in a rendered report. Any key that was
# ever hard-coded in this file should be treated as leaked and regenerated.
consumer_key <- Sys.getenv("TWITTER_CONSUMER_KEY")
consumer_secret <- Sys.getenv("TWITTER_CONSUMER_SECRET")
access_token <- Sys.getenv("TWITTER_ACCESS_TOKEN")
access_secret <- Sys.getenv("TWITTER_ACCESS_SECRET")
# Sys.getenv() returns "" for an unset variable; fail early with a clear
# message rather than sending empty credentials to the API.
if (!all(nzchar(c(consumer_key, consumer_secret, access_token, access_secret)))) {
  stop(
    "Twitter credentials are missing. Set TWITTER_CONSUMER_KEY, ",
    "TWITTER_CONSUMER_SECRET, TWITTER_ACCESS_TOKEN and TWITTER_ACCESS_SECRET.",
    call. = FALSE
  )
}
# Twitter authentication
create_token(
  app = "my_twitter_research_app",
  consumer_key = consumer_key,
  consumer_secret = consumer_secret,
  access_token = access_token,
  access_secret = access_secret
)
<Token>
<oauth_endpoint>
request: https://api.twitter.com/oauth/request_token
authorize: https://api.twitter.com/oauth/authenticate
access: https://api.twitter.com/oauth/access_token
<oauth_app> my_twitter_research_app
key: sxxdmMv0ceEXTFN0ZlqsdTcdu
secret: <hidden>
<credentials> oauth_token, oauth_token_secret
---
# Retrieve tweets
# Request up to 1000 tweets matching "Indosat". The search endpoint is rate
# limited; `retryonratelimit = TRUE` makes rtweet wait for the limit to reset
# and carry on, instead of returning an incomplete (or empty) result.
tweets2 <- search_tweets(
  "Indosat",
  n = 1000,
  retryonratelimit = TRUE,
  tweet_mode = "extended"
)
Searching for tweets...
Rate limit exceeded - 88Rate limit exceededFinished collecting tweets!
# Keep only the first tweet for every distinct text. When the search returned
# nothing, the result has no `text` column and distinct() cannot be applied,
# so that case is reported explicitly instead of relying on a dplyr warning.
if ("text" %in% names(tweets2)) {
  tweets2 <- distinct(tweets2, text, .keep_all = TRUE)
} else {
  warning(
    "`tweets2` has no `text` column; the Indosat search returned no tweets.",
    call. = FALSE
  )
}
Trying to compute distinct() for variables not found in the data:
- `text`
This is an error, but only a warning is raised for compatibility reasons.
The operation will return the input unchanged.
# Time series of the Indosat tweet counts, bucketed into 3-hour intervals.
# The Indosat search result is stored in `tweets2`; the script never defines
# an object called `tweets`, so plotting `tweets` fails or uses stale data.
ts_plot(tweets2, "3 hours") +
  theme_minimal() +
  theme(plot.title = ggplot2::element_text(face = "bold")) +
  labs(
    x = NULL, y = NULL,
    title = "Frequency of Indosat Twitter statuses from past 3 hours",
    subtitle = "Twitter status (tweet) counts aggregated using three-hour intervals",
    caption = "\nSource: Data collected from Twitter's REST API via rtweet"
  )
Error: no datetime (POSIXct) var found
# Show the last 20 collected Indosat tweets. The search result lives in
# `tweets2`; the script never defines an object called `tweets`.
tail(tweets2, 20)
library(tm)
library(NLP)
# build a corpus, and specify the source to be character vectors
myCorpus <- Corpus(VectorSource(tweets2$text))
# convert to lower case
myCorpus <- tm_map(myCorpus, content_transformer(tolower))
transformation drops documents
# Strip hyperlinks: "http" followed by any run of non-whitespace characters
# is replaced by the empty string.
removeURL <- function(x) {
  gsub(pattern = "http[^[:space:]]*", replacement = "", x = x)
}
url_cleaner <- content_transformer(removeURL)
myCorpus <- tm_map(myCorpus, url_cleaner)
transformation drops documents
# Delete every character that is neither alphabetic ([:alpha:]) nor
# whitespace ([:space:]), i.e. digits, punctuation and other symbols.
removeNumPunct <- function(x) {
  gsub(pattern = "[^[:alpha:][:space:]]*", replacement = "", x = x)
}
symbol_cleaner <- content_transformer(removeNumPunct)
myCorpus <- tm_map(myCorpus, symbol_cleaner)
transformation drops documents
# Stopword removal: English stopwords (keeping "r" and "big"), a handful of
# extra words, and the Indonesian list read from disk.
# NOTE(review): the searched brand name "indosat" is not in this list, while
# the Telkomsel section lists its own brand name - confirm whether intended.
extra_words <- c("use", "see", "used", "via", "amp", "indihome")
english_words <- setdiff(stopwords("english"), c("r", "big"))
myStopwords <- c(english_words, extra_words)
# Only the first column (V1) of the stopword file is used.
stopwords_id <- read.table(file = "E://stopwords-id.txt", header = FALSE)
indonesian_words <- as.matrix(stopwords_id$V1)
myStopwords <- c(myStopwords, indonesian_words, "hi", "yg")
myCorpus <- tm_map(myCorpus, removeWords, myStopwords)
transformation drops documents
# Apply tm's stripWhitespace transformation to every document of the corpus.
myCorpus <- tm_map(x = myCorpus, FUN = stripWhitespace)
transformation drops documents
# Snapshot of the cleaned corpus, kept for stem completion later.
myCorpusCopy <- myCorpus
# Term-document matrix; wordLengths = c(1, Inf) keeps terms of any length,
# including single-character ones.
tdm <- TermDocumentMatrix(
  x = myCorpus,
  control = list(wordLengths = c(1, Inf))
)
print(tdm)
<<TermDocumentMatrix (terms: 11372, documents: 5314)>>
Non-/sparse entries: 74440/60356368
Sparsity : 100%
Maximal term length: 48
Weighting : term frequency (tf)
# Terms that occur at least 20 times in the Indosat corpus.
freq.terms <- findFreqTerms(tdm, lowfreq = 20)
# Show at most the first 50 of them. Unlike `[1:50]`, head() does not pad the
# result with NA when fewer than 50 terms qualify.
head(freq.terms, 50)
[1] "beli" "gb" "indosat" "indosatcare" "paketan" "rb"
[7] "bagus" "gak" "ya" "aksesnya" "coba" "dm"
[13] "hpnya" "jaringan" "kak" "manual" "masuk" "nomornya"
[19] "operator" "pengaturan" "pindah" "pindahkan" "rio" "setting"
[25] "sinyalnya" "thanks" "blm" "detail" "diubah" "kondisi"
[31] "lokasi" "nomor" "otomatis" "pengguna" "pilih" "salamchun"
[37] "sukses" "hai" "mey" "akses" "bantu" "berkendala"
[43] "cek" "indy" "internet" "jalan" "kecamatan" "keterangan"
[49] "kota" "mengalami"
# Total count per term across all documents, keeping only terms that were
# seen at least 150 times.
term.freq <- rowSums(as.matrix(tdm))
term.freq <- subset(term.freq, term.freq >= 150)
df <- data.frame(term = names(term.freq), freq = term.freq)
# Horizontal bar chart of the most frequent terms.
ggplot(data = df, mapping = aes(x = term, y = freq)) +
  geom_bar(stat = "identity") +
  xlab("Terms") +
  ylab("Count") +
  coord_flip() +
  theme(axis.text = element_text(size = 7))
library(wordcloud)
library(RColorBrewer)
m <- as.matrix(tdm)
# calculate the frequency of words and sort it by frequency
word.freq <- sort(rowSums(m), decreasing = TRUE)
# colors: the 9-class "BuGn" palette without its four lightest shades
pal <- brewer.pal(9, "BuGn")[-(1:4)]
# Word cloud of terms occurring at least 100 times, most frequent first.
# TRUE/FALSE are spelled out because T and F are ordinary, reassignable
# variables.
INDOSAT <- wordcloud(
  words = names(word.freq),
  freq = word.freq,
  min.freq = 100,
  random.order = FALSE,
  colors = pal
)
# Print the two provider labels, one after the other.
invisible(lapply(c("TELKOMSEL", "INDOSAT"), print))