The first step is to load the required libraries.
library(rtweet)
library(quanteda)
library(katadasaR)
library(tidyverse)
library(seededlda)
library(tidytext)
library(quanteda.textstats)
library(quanteda.textplots)
library(wordcloud2)
The next step is to collect the tweet data; I search for Indonesian-language tweets with the keyword “holywings” (note that search_tweets() requires an authenticated rtweet token).
df <- search_tweets("holywings", n = 10000, lang = "id", include_rts = T)
Duplicate tweets and missing values are then removed.
df.tweets <- df %>%
select(full_text) %>%
na.exclude() %>%
distinct()
rmarkdown::paged_table(data.frame(head(df.tweets)))
I convert the data frame into a quanteda corpus to make the cleaning steps easier.
corpus.tweets <- df.tweets %>%
corpus(text_field = "full_text")
head(corpus.tweets)
## Corpus consisting of 6 documents.
## text1 :
## "Holywings yang dikenal sebagai usaha restoran, kelab malam, ..."
##
## text2 :
## "Holywings harus meminta maaf kepada seluruh umat Islam dalam..."
##
## text3 :
## "GP Ansor Akan Polisikan Holywings Buntut Promo Alkohol untuk..."
##
## text4 :
## "RT @Lelaki_5uny1: Breaking News....!! Konpres Kapolres Metro..."
##
## text5 :
## "RT @DPP_LIP: PERNYATAAN SIKAP BERSAMA <U+0001D405><U+0001D40F><U+0001D408>, <U+0001D406><U+0001D40D><U+0001D40F><U+0001D405> <U+0001D414><U+0001D40B><U+0001D400><U+0001D40C><U+0001D400> DAN <U+0001D40F><U+0001D400>..."
##
## text6 :
## "Polisi Tetapkan 6 Tersangka Staf Holywings, Kena Pasal Penis..."
Next, the corpus is tokenized, removing symbols, URLs, and numbers during tokenization. As the output below shows, some punctuation still remains, so I will clean it with tokens_remove() in a later chunk.
tokens.tweets <- corpus.tweets %>%
tokens(remove_symbols = T,
remove_url = T,
remove_numbers = T)
head(tokens.tweets)
## Tokens consisting of 6 documents.
## text1 :
## [1] "Holywings" "yang" "dikenal" "sebagai" "usaha" "restoran"
## [7] "," "kelab" "malam" "," "dan" "bar"
## [ ... and 25 more ]
##
## text2 :
## [1] "Holywings" "harus" "meminta" "maaf" "kepada" "seluruh"
## [7] "umat" "Islam" "dalam" "waktu" "x" "jam"
## [ ... and 1 more ]
##
## text3 :
## [1] "GP" "Ansor" "Akan" "Polisikan" "Holywings" "Buntut"
## [7] "Promo" "Alkohol" "untuk" "Muhammad"
##
## text4 :
## [1] "RT" "@Lelaki_5uny1" ":" "Breaking"
## [5] "News" "." "." "."
## [9] "." "!" "!" "Konpres"
## [ ... and 14 more ]
##
## text5 :
## [1] "RT" "@DPP_LIP" ":" "PERNYATAAN" "SIKAP"
## [6] "BERSAMA" "<U+0001D405><U+0001D40F><U+0001D408>" "," "<U+0001D406><U+0001D40D><U+0001D40F><U+0001D405>" "<U+0001D414><U+0001D40B><U+0001D400><U+0001D40C><U+0001D400>"
## [11] "DAN" "<U+0001D40F><U+0001D400>"
## [ ... and 6 more ]
##
## text6 :
## [1] "Polisi" "Tetapkan" "Tersangka" "Staf" "Holywings"
## [6] "," "Kena" "Pasal" "Penistaan" "Agama"
## [11] "lewat" "@KanalnewsC"
## [ ... and 1 more ]
I use regular expressions to clean the text data: emoticons and other non-ASCII characters, invoice-style codes, URLs, carriage returns, mentions, numbers, and other leftovers are removed.
tokens.tweets.clean <- tokens.tweets %>%
tokens_tolower() %>%
tokens_remove("[:punct:]", valuetype = "regex") %>%
tokens_remove("inv/[0-9]+/+[xvi]+/[xvi]+/[0-9]+", valuetype = "regex") %>%
tokens_remove("[^\x01-\x7F]", valuetype = "regex") %>%
tokens_remove("[0-9]+", valuetype = "regex") %>%
tokens_remove("[:graph:]", valuetype = "regex") %>%
tokens_remove("http[^[:space:]]*", valuetype = "regex") %>%
tokens_remove("[\r\n]", valuetype = "regex") %>%
tokens_remove("@\\w+", valuetype = "regex")
tokens.tweets.clean
## Tokens consisting of 901 documents.
## text1 :
## [1] "holywings" "yang" "dikenal" "sebagai" "usaha" "restoran"
## [7] "kelab" "malam" "dan" "bar" "membuat" "promosi"
## [ ... and 19 more ]
##
## text2 :
## [1] "holywings" "harus" "meminta" "maaf" "kepada" "seluruh"
## [7] "umat" "islam" "dalam" "waktu" "x" "jam"
##
## text3 :
## [1] "gp" "ansor" "akan" "polisikan" "holywings" "buntut"
## [7] "promo" "alkohol" "untuk" "muhammad"
##
## text4 :
## [1] "rt" "breaking" "news" "konpres" "kapolres" "metro"
## [7] "jakarta" "selatan" "soal" "promo" "alkohol" "menista"
## [ ... and 5 more ]
##
## text5 :
## [1] "rt" "pernyataan" "sikap" "bersama" "dan"
## [6] "tentang" "penodaan" "agama" "oleh" "pihak"
## [11] "holywings"
##
## text6 :
## [1] "polisi" "tetapkan" "tersangka" "staf" "holywings" "kena"
## [7] "pasal" "penistaan" "agama" "lewat"
##
## [ reached max_ndoc ... 895 more documents ]
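As a quick sanity check of two of the patterns used above, here is a toy sketch with stringi, the string engine behind quanteda's regex matching (the URL string is only illustrative):
library(stringi)
# a mention token matches the "@\\w+" pattern, so tokens like this are removed
stri_detect_regex("@Lelaki_5uny1", "@\\w+")                    # TRUE
# a leftover URL fragment matches the "http[^[:space:]]*" pattern
stri_detect_regex("https://t.co/xxxx", "http[^[:space:]]*")    # TRUE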
Because everyday Indonesian is often written with informal (colloquial) spellings, each informal spelling is normalized to its formal form using a colloquial Indonesian lexicon.
spelling.corrector <- read.csv(file = "colloquial-indonesian-lexicon.csv", header = T, stringsAsFactors = F)
slang <- spelling.corrector$slang
formal <- spelling.corrector$formal
rmarkdown::paged_table(head(data.frame(spelling.corrector)))
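To illustrate how tokens_replace() applies such a slang-to-formal lexicon, here is a toy sketch with a few illustrative pairs (not taken from the actual lexicon file):
toy.tokens <- tokens("gue gak tau")
tokens_replace(toy.tokens, pattern = c("gue", "gak", "tau"),
replacement = c("saya", "tidak", "tahu"))
# the informal "gue gak tau" becomes the formal "saya tidak tahu"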
The spelling normalization is now applied to the tokens with tokens_replace().
tokens.tweets.clean.words <- tokens.tweets.clean %>%
tokens_replace(pattern = slang, replacement = formal)
tokens.tweets.clean.words
## Tokens consisting of 901 documents.
## text1 :
## [1] "holywings" "yang" "dikenal" "sebagai" "usaha" "restoran"
## [7] "kelab" "malam" "dan" "bar" "membuat" "promosi"
## [ ... and 19 more ]
##
## text2 :
## [1] "holywings" "harus" "meminta" "maaf" "kepada" "seluruh"
## [7] "umat" "islam" "dalam" "waktu" "kali" "jam"
##
## text3 :
## [1] "gp" "ansor" "akan" "polisikan" "holywings" "buntut"
## [7] "promo" "alkohol" "untuk" "muhammad"
##
## text4 :
## [1] "rt" "breaking" "news" "konpres" "kapolres" "metro"
## [7] "jakarta" "selatan" "soal" "promo" "alkohol" "menista"
## [ ... and 5 more ]
##
## text5 :
## [1] "rt" "pernyataan" "sikap" "bersama" "dan"
## [6] "tentang" "penodaan" "agama" "oleh" "pihak"
## [11] "holywings"
##
## text6 :
## [1] "polisi" "tetapkan" "tersangka" "staf" "holywings" "kena"
## [7] "pasal" "penistaan" "agama" "lewat"
##
## [ reached max_ndoc ... 895 more documents ]
Stemming reduces affixed words to their base (root) forms; here I use the katadasaR stemmer.
# helper: stem each token with katadasaR and collapse the document back into one string
stemming <- function(x){
paste(lapply(x,katadasar), collapse = " ")}
# apply the helper to every document, then re-tokenize the stemmed strings
tokens.tweets.clean.words <- tokens(lapply(tokens(tokens.tweets.clean.words), stemming))
tokens.tweets.clean.words <- tokens(tokenize_fasterword((tokens.tweets.clean.words)))
tokens.tweets.clean.words
## Tokens consisting of 901 documents.
## text1 :
## [1] "holywings" "yang" "kenal" "bagai" "usaha" "restoran"
## [7] "kelab" "malam" "dan" "bar" "buat" "promosi"
## [ ... and 19 more ]
##
## text2 :
## [1] "holywings" "harus" "minta" "maaf" "kepada" "seluruh"
## [7] "umat" "islam" "dalam" "waktu" "kali" "jam"
##
## text3 :
## [1] "gp" "ansor" "akan" "polis" "holywings" "buntut"
## [7] "promo" "alkohol" "untuk" "muhammad"
##
## text4 :
## [1] "rt" "breaking" "news" "konpres" "kapolres" "metro"
## [7] "jakarta" "selatan" "soal" "promo" "alkohol" "nista"
## [ ... and 5 more ]
##
## text5 :
## [1] "rt" "nyata" "sikap" "sama" "dan" "tentang"
## [7] "noda" "agama" "oleh" "pihak" "holywings"
##
## text6 :
## [1] "polisi" "tetap" "sangka" "staf" "holywings" "kena"
## [7] "pasal" "nista" "agama" "lewat"
##
## [ reached max_ndoc ... 895 more documents ]
Stopwords are high-frequency words that carry little meaning for the analysis; I remove them using an Indonesian stopword list.
stopwords.id <- read.table("stopwords.txt") %>%
as.character()
head(stopwords.id)
## [1] "ada" "adalah" "adanya" "adapun" "agak" "agaknya"
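Before applying the list to the tweets, here is a toy sketch of what tokens_remove() does with such words (the example sentence is made up):
toy.tokens <- tokens("holywings harus minta maaf kepada umat")
tokens_remove(toy.tokens, c("harus", "kepada"))
# only the content words remain: "holywings" "minta" "maaf" "umat"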
The stopword removal is now applied to the tokens.
tokens.tweets.clean.words <- tokens.tweets.clean.words %>%
tokens_remove(stopwords.id)
tokens.tweets.clean.words
## Tokens consisting of 901 documents.
## text1 :
## [1] "holywings" "kenal" "usaha" "restoran" "kelab" "malam"
## [7] "bar" "promosi" "minum" "alkohol" "gratis" "unjung"
## [ ... and 9 more ]
##
## text2 :
## [1] "holywings" "maaf" "umat" "islam" "kali" "jam"
##
## text3 :
## [1] "gp" "ansor" "polis" "holywings" "buntut" "promo"
## [7] "alkohol" "muhammad"
##
## text4 :
## [1] "breaking" "news" "konpres" "kapolres" "metro" "jakarta"
## [7] "selatan" "promo" "alkohol" "nista" "agama" "holywings"
## [ ... and 2 more ]
##
## text5 :
## [1] "nyata" "sikap" "noda" "agama" "holywings"
##
## text6 :
## [1] "polisi" "sangka" "staf" "holywings" "kena" "pasal"
## [7] "nista" "agama"
##
## [ reached max_ndoc ... 895 more documents ]
A document-feature matrix (DFM) is a matrix with documents as rows and features (words) as columns.
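To make the structure concrete before building the real matrix below, here is a toy sketch on two made-up sentences:
toy.texts <- c(doc1 = "promo alkohol gratis", doc2 = "promo minuman gratis gratis")
dfm(tokens(toy.texts))
# doc1 and doc2 are the rows; promo, alkohol, gratis, and minuman are the feature columns,
# and each cell counts how often the word occurs in that document ("gratis" is 2 in doc2)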
dfm <- tokens.tweets.clean.words %>%
dfm()
dfm
## Document-feature matrix of: 901 documents, 2,137 features (99.69% sparse) and 0 docvars.
## features
## docs holywings kenal usaha restoran kelab malam bar promosi minum alkohol
## text1 1 1 1 1 1 1 1 2 1 1
## text2 1 0 0 0 0 0 0 0 0 0
## text3 1 0 0 0 0 0 0 0 0 1
## text4 1 0 0 0 0 0 0 0 0 1
## text5 1 0 0 0 0 0 0 0 0 0
## text6 1 0 0 0 0 0 0 0 0 0
## [ reached max_ndoc ... 895 more documents, reached max_nfeat ... 2,127 more features ]
The DFM is trimmed so that overly dominant words do not swamp the subsequent analyses: only terms at or above the 80th percentile of term frequency are kept, and terms that occur in more than 10% of the documents (such as the query word “holywings” itself) are dropped.
dfm.trimmed <- dfm %>%
dfm_trim(min_termfreq = 0.8, termfreq_type = "quantile",
max_docfreq = 0.1, docfreq_type = "prop")
head(dfm.trimmed)
## Document-feature matrix of: 6 documents, 430 features (98.41% sparse) and 0 docvars.
## features
## docs usaha restoran malam bar promosi minum alkohol gratis nama muhammad
## text1 1 1 1 1 2 1 1 1 1 1
## text2 0 0 0 0 0 0 0 0 0 0
## text3 0 0 0 0 0 0 1 0 0 1
## text4 0 0 0 0 0 0 1 0 0 0
## text5 0 0 0 0 0 0 0 0 0 0
## text6 0 0 0 0 0 0 0 0 0 0
## [ reached max_nfeat ... 420 more features ]
The topic modelling now begins: an LDA model with k = 9 topics is fitted on the trimmed DFM with seededlda, and the top 10 terms of each topic are inspected.
tm.lda <- textmodel_lda(dfm.trimmed, k = 9)
data.frame(terms(tm.lda, 10))
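Besides the top terms, the fitted object also stores per-document topic proportions. A hedged sketch for inspecting them, assuming the seededlda accessors ($theta and topics()) behave as documented:
# theta stores the document-topic proportions (one row per tweet, one column per topic)
head(tm.lda$theta)
# topics() returns the single most likely topic label for each document
head(topics(tm.lda))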
Next, the topic model is plotted with ggplot2, using the per-topic term probabilities (phi) and tidytext's reorder_within() to order the terms within each facet.
# extract the per-topic term probabilities (phi) and keep the 10 highest-probability terms per topic
topwords.lda <- setNames(reshape2::melt(tm.lda$phi),c("topic","term","beta")) %>%
group_by(topic) %>%
slice_max(beta, n = 10) %>%
ungroup() %>%
arrange(topic, -beta)
# keep terms of at least 3 characters (and drop any residual "other" topic), then plot one facet per topic
topwords.lda[nchar(as.character(topwords.lda$term))>=3 & topwords.lda$topic!="other",] %>%
mutate(term = reorder_within(term, beta, topic)) %>%
ggplot(aes(beta, term, fill = factor(topic))) +
geom_col(show.legend = F) +
facet_wrap(~ topic, scales = "free") +
scale_y_reordered()
A wordcloud shows the words that occur most frequently in the data or corpus; here the 100 most frequent features of the DFM are plotted with wordcloud2.
# wordcloud
word_frequency <- dfm %>%
textstat_frequency(n = 100) %>%
as.data.frame()
wordcloud2(word_frequency, size = 7, minRotation = -pi/6, maxRotation = -pi/6,
rotateRatio = 0)
Before plotting the text network, a feature co-occurrence matrix (FCM) is built from the trimmed DFM and restricted to its 100 most frequent features.
fcm.tweets <- fcm(dfm.trimmed)
feat <- fcm.tweets %>%
topfeatures(100) %>%
names()
fcm_selected <- fcm_select(fcm.tweets, pattern = feat, selection = "keep")
dim(fcm_selected)
## [1] 100 100
topfeatures(fcm_selected)
## people holy berlaku syarat giveaway penuh ecek dasar
## 199 192 173 160 157 148 134 128
## info event
## 119 106
A text network is a plot that shows how words co-occur across the documents (the units of analysis).
# scale each node by the log frequency of the corresponding feature in the DFM
size <- log(colSums(dfm_select(dfm, feat, selection = "keep")))
set.seed(144)
textplot_network(fcm_selected, min_freq = 0.3, vertex_size = size / max(size) * 3)
## Warning: ggrepel: 2 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
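The ggrepel warning only means that a couple of crowded node labels were dropped. A hedged aside: ggrepel reads a global option, so raising the overlap limit before re-plotting usually brings the missing labels back.
options(ggrepel.max.overlaps = Inf)   # allow ggrepel to label all nodes, even crowded ones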
Benoit, K., Watanabe, K., Wang, H., Lua, J. W., & Kuha, J. (2021). Package “quanteda.textstats.” Research Bulletin, 27(2), 37–54.
Benoit, K., Watanabe, K., Wang, H., Nulty, P., Obeng, A., Müller, S., & Matsuo, A. (2018). quanteda: An R package for the quantitative analysis of textual data. Journal of Open Source Software, 3(30), 774. https://doi.org/10.21105/joss.00774
Kearney, M. (2019). rtweet: Collecting and analyzing Twitter data. Journal of Open Source Software, 4(42), 1829. https://doi.org/10.21105/joss.01829
Lang, D., Chien, G., & Lang, D. (2018). Package “wordcloud2.”
Watanabe, K., & Xuan-Hieu, P. (2022). Package “seededlda.”
Wickham, H., Averick, M., Bryan, J., Chang, W., McGowan, L., François, R., Grolemund, G., Hayes, A., Henry, L., Hester, J., Kuhn, M., Pedersen, T., Miller, E., Bache, S., Müller, K., Ooms, J., Robinson, D., Seidel, D., Spinu, V., … Yutani, H. (2019). Welcome to the Tidyverse. Journal of Open Source Software, 4(43), 1686. https://doi.org/10.21105/joss.01686