The first step is to load the required libraries.
library(rtweet)
library(quanteda)
library(katadasaR)
library(tidyverse)
library(seededlda)
library(tidytext)
library(quanteda.textstats)
library(quanteda.textplots)
library(wordcloud2)
The next step is to collect the tweets; I use the keyword “#BapakPolitikIdentitas”.
df <- search_tweets("#BapakPolitikIdentitas", n = 10000, lang = "id", include_rts = T)
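Note that search_tweets() only works with an authorized Twitter token. If no token is cached yet, one can be created beforehand; the sketch below assumes rtweet's create_token() interface, and the credential values are placeholders that must be replaced with the keys from your own Twitter developer app.
# hypothetical credentials -- replace with your own app keys
token <- create_token(
  app = "my_twitter_app",
  consumer_key = "CONSUMER_KEY",
  consumer_secret = "CONSUMER_SECRET",
  access_token = "ACCESS_TOKEN",
  access_secret = "ACCESS_SECRET"
)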
Duplicate tweets are then removed.
df.tweets <- df %>%
select(full_text) %>%
na.exclude() %>%
distinct()
rmarkdown::paged_table(data.frame(head(df.tweets)))
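As a quick sanity check, the number of tweets before and after de-duplication can be compared (the counts depend on whatever the search returned).
nrow(df)        # rows returned by search_tweets
nrow(df.tweets) # unique, non-missing tweet texts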
I convert the data frame into a corpus to make the cleaning process easier.
corpus.tweets <- df.tweets %>%
corpus(text_field = "full_text")
head(corpus.tweets)
## Corpus consisting of 6 documents.
## text1 :
## "RT @MahendraXY: Satu geng nih <U+0001F60F> #BapakPolitikIdentitas https..."
##
## text2 :
## "@YRadianto gaspoool!!jangan kasi kendor ...jejak digital #Ba..."
##
## text3 :
## "Hanya orang bodoh yg diam saja dikibuli #BapakPolitikIdentit..."
##
## text4 :
## "Jejak Digital #BapakPolitikIdentitas https://t.co/eOVkkN7mm..."
##
## text5 :
## "@Mdy_Asmara1701 Secara gak langsung elu mengakui kalo junjun..."
##
## text6 :
## "Sah, Ahok Ditetapkan Netizen Sebagai Bapak Politik Identitas..."
Since some punctuation and other noise still remain, I will clean them up with the tokens_remove function in a later chunk; first, the corpus is tokenized while removing symbols, URLs, and numbers.
tokens.tweets <- corpus.tweets %>%
tokens(remove_symbols = T,
remove_url = T,
remove_numbers = T)
head(tokens.tweets)
## Tokens consisting of 6 documents.
## text1 :
## [1] "RT" "@MahendraXY" ":"
## [4] "Satu" "geng" "nih"
## [7] "#BapakPolitikIdentitas"
##
## text2 :
## [1] "@YRadianto" "gaspoool" "!" "!" "jangan"
## [6] "kasi" "kendor" "." "." "."
## [11] "jejak" "digital"
## [ ... and 5 more ]
##
## text3 :
## [1] "Hanya" "orang" "bodoh"
## [4] "yg" "diam" "saja"
## [7] "dikibuli" "#BapakPolitikIdentitas" "berulang"
## [10] "kali" "."
##
## text4 :
## [1] "Jejak" "Digital" "#BapakPolitikIdentitas"
##
## text5 :
## [1] "@Mdy_Asmara1701" "Secara" "gak"
## [4] "langsung" "elu" "mengakui"
## [7] "kalo" "junjungan" "lu"
## [10] "#BapakPolitikIdentitas" "." "Intinya"
## [ ... and 6 more ]
##
## text6 :
## [1] "Sah" "," "Ahok"
## [4] "Ditetapkan" "Netizen" "Sebagai"
## [7] "Bapak" "Politik" "Identitas"
## [10] "," "Tagar" "#BapakPolitikIdentitas"
## [ ... and 9 more ]
I use regular expressions to clean the text data: several patterns remove emoticons, invoice-number strings, URLs, carriage returns, mentions, numbers, and other leftovers.
tokens.tweets.clean <- tokens.tweets %>%
tokens_tolower() %>%
tokens_remove("[:punct:]", valuetype = "regex") %>%
tokens_remove("inv/[0-9]+/+[xvi]+/[xvi]+/[0-9]+", valuetype = "regex") %>%
tokens_remove("[^\x01-\x7F]", valuetype = "regex") %>%
tokens_remove("[0-9]+", valuetype = "regex") %>%
tokens_remove("[:graph:]", valuetype = "regex") %>%
tokens_remove("http[^[:space:]]*", valuetype = "regex") %>%
tokens_remove("[\r\n]", valuetype = "regex") %>%
tokens_remove("@\\w+", valuetype = "regex")
tokens.tweets.clean
## Tokens consisting of 2,583 documents.
## text1 :
## [1] "rt" "satu" "geng" "nih"
##
## text2 :
## [1] "gaspoool" "jangan" "kasi" "kendor" "jejak"
## [6] "digital" "bertebaran"
##
## text3 :
## [1] "hanya" "orang" "bodoh" "yg" "diam" "saja" "dikibuli"
## [8] "berulang" "kali"
##
## text4 :
## [1] "jejak" "digital"
##
## text5 :
## [1] "secara" "gak" "langsung" "elu" "mengakui" "kalo"
## [7] "junjungan" "lu" "intinya" "mah" "gak" "mau"
## [ ... and 2 more ]
##
## text6 :
## [1] "sah" "ahok" "ditetapkan" "netizen" "sebagai"
## [6] "bapak" "politik" "identitas" "tagar" "menggema"
## [11] "di" "twitter"
## [ ... and 5 more ]
##
## [ reached max_ndoc ... 2,577 more documents ]
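To make the effect of these patterns concrete, here is a quick illustration on a single hypothetical tweet string using stringr (loaded with the tidyverse); only the mention and URL patterns are shown.
# hypothetical example string, not taken from the dataset
sample_tweet <- "RT @MahendraXY: Satu geng nih #BapakPolitikIdentitas https://t.co/abc123"
str_remove_all(sample_tweet, "@\\w+")              # drops the mention
str_remove_all(sample_tweet, "http[^[:space:]]*")  # drops the URL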
Because everyday Indonesian is written with informal (colloquial) spellings, every informal spelling will be normalized to its formal form.
spelling.corrector <- read.csv(file = "colloquial-indonesian-lexicon.csv", header = T, stringsAsFactors = F)
slang <- spelling.corrector$slang
formal <- spelling.corrector$formal
rmarkdown::paged_table(head(data.frame(spelling.corrector)))
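To make the mapping concrete, here is a toy illustration of how the slang column is swapped for the formal column with tokens_replace(); the rows below are made up for illustration (the mappings match the output shown later), while the real lexicon comes from the CSV read above.
# toy lexicon rows for illustration only
toy.lexicon <- data.frame(slang  = c("yg", "gak", "kasi"),
                          formal = c("yang", "enggak", "kasih"))
toy.tokens <- tokens("gak usah kasi kendor yg penting")
tokens_replace(toy.tokens, pattern = toy.lexicon$slang, replacement = toy.lexicon$formal)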
The spelling formalization process begins.
tokens.tweets.clean.words <- tokens.tweets.clean %>%
tokens_replace(pattern = slang, replacement = formal)
tokens.tweets.clean.words
## Tokens consisting of 2,583 documents.
## text1 :
## [1] "rt" "satu" "geng" "nih"
##
## text2 :
## [1] "gaspoool" "jangan" "kasih" "kendor" "jejak"
## [6] "digital" "bertebaran"
##
## text3 :
## [1] "hanya" "orang" "bodoh" "yang" "diam" "saja" "dikibuli"
## [8] "berulang" "kali"
##
## text4 :
## [1] "jejak" "digital"
##
## text5 :
## [1] "secara" "enggak" "langsung" "lu" "mengakui" "kalo"
## [7] "junjungan" "lu" "intinya" "mah" "enggak" "mau"
## [ ... and 2 more ]
##
## text6 :
## [1] "sah" "ahok" "ditetapkan" "netizen" "sebagai"
## [6] "bapak" "politik" "identitas" "tagar" "menggema"
## [11] "di" "twitter"
## [ ... and 5 more ]
##
## [ reached max_ndoc ... 2,577 more documents ]
Stemming reduces affixed words to their base forms (kata dasar).
stemming <- function(x){
paste(lapply(x,katadasar), collapse = " ")}
tokens.tweets.clean.words <- tokens(lapply(tokens(tokens.tweets.clean.words), stemming))
tokens.tweets.clean.words <- tokens(tokenize_fasterword((tokens.tweets.clean.words)))
tokens.tweets.clean.words
## Tokens consisting of 2,583 documents.
## text1 :
## [1] "rt" "satu" "geng" "nih"
##
## text2 :
## [1] "gaspoool" "jangan" "kasih" "kendor" "jejak" "digital" "tebar"
##
## text3 :
## [1] "hanya" "orang" "bodoh" "yang" "diam" "saja" "kibul" "ulang" "kali"
##
## text4 :
## [1] "jejak" "digital"
##
## text5 :
## [1] "cara" "enggak" "langsung" "lu" "aku" "kalo"
## [7] "junjung" "lu" "inti" "mah" "enggak" "mau"
## [ ... and 2 more ]
##
## text6 :
## [1] "sah" "ahok" "tetap" "netizen" "bagai" "bapak"
## [7] "politik" "identitas" "tagar" "gema" "di" "twitter"
## [ ... and 5 more ]
##
## [ reached max_ndoc ... 2,577 more documents ]
Stopwords are frequently recurring words, as well as words that carry no meaning for the analysis.
stopwords.id <- read.table("stopwords.txt") %>%
as.character()
head(stopwords.id)
## [1] "ada" "adalah" "adanya" "adapun" "agak" "agaknya"
The stopword removal process begins.
tokens.tweets.clean.words <- tokens.tweets.clean.words %>%
tokens_remove(stopwords.id)
tokens.tweets.clean.words
## Tokens consisting of 2,583 documents.
## text1 :
## [1] "geng"
##
## text2 :
## [1] "gaspoool" "kasih" "kendor" "jejak" "digital" "tebar"
##
## text3 :
## [1] "orang" "bodoh" "diam" "kibul" "ulang" "kali"
##
## text4 :
## [1] "jejak" "digital"
##
## text5 :
## [1] "langsung" "junjung" "inti" "mah" "kotor"
##
## text6 :
## [1] "sah" "ahok" "netizen" "politik" "identitas" "tagar"
## [7] "gema" "twitter" "hoax"
##
## [ reached max_ndoc ... 2,577 more documents ]
A document-feature matrix (DFM) is a matrix with features (words) as its columns and documents as its rows.
dfm <- tokens.tweets.clean.words %>%
dfm()
dfm
## Document-feature matrix of: 2,583 documents, 3,484 features (99.80% sparse) and 0 docvars.
## features
## docs geng gaspoool kasih kendor jejak digital tebar orang bodoh diam
## text1 1 0 0 0 0 0 0 0 0 0
## text2 0 1 1 1 1 1 1 0 0 0
## text3 0 0 0 0 0 0 0 1 1 1
## text4 0 0 0 0 1 1 0 0 0 0
## text5 0 0 0 0 0 0 0 0 0 0
## text6 0 0 0 0 0 0 0 0 0 0
## [ reached max_ndoc ... 2,577 more documents, reached max_nfeat ... 3,474 more features ]
The DFM is trimmed so that no word dominates the wordcloud and topic modelling too heavily.
dfm.trimmed <- dfm %>%
dfm_trim(min_termfreq = 0.8, termfreq_type = "quantile",
max_docfreq = 0.1, docfreq_type = "prop")
head(dfm.trimmed)
## Document-feature matrix of: 6 documents, 745 features (99.49% sparse) and 0 docvars.
## features
## docs kasih jejak digital tebar orang bodoh diam ulang kali langsung
## text1 0 0 0 0 0 0 0 0 0 0
## text2 1 1 1 1 0 0 0 0 0 0
## text3 0 0 0 0 1 1 1 1 1 0
## text4 0 1 1 0 0 0 0 0 0 0
## text5 0 0 0 0 0 0 0 0 0 1
## text6 0 0 0 0 0 0 0 0 0 0
## [ reached max_nfeat ... 735 more features ]
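Before fitting the topic model, it can help to glance at the most frequent features that remain after trimming; a quick check with quanteda's topfeatures() (the result will vary with the collected tweets).
topfeatures(dfm.trimmed, 20)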
The topic modelling process begins: an LDA model is fitted to the trimmed DFM with k = 9 topics.
tm.lda <- textmodel_lda(dfm.trimmed, k = 9)
data.frame(terms(tm.lda, 10))
## topic1 topic2 topic3 topic4 topic5 topic6 topic7
## 1 jakarta the jejak identitas warga indonesia orang
## 2 selamat hasil digital dki nama baswedan wan
## 3 holywings putrasiregar lupa agama jalan anis ahok
## 4 mandiri selamat dukung pilkada ganti identitas lihat
## 5 cinta jakarta tolak gubernur kerja ahok nyata
## 6 papua jokowi cuci nkri bikin tagar gelar
## 7 maria adha identitas menang susah netizen karya
## 8 widi idul buzzer bangsa jakarta trending makna
## 9 alphard litbang lawan ayat bahagia alam pakai
## 10 chris alphard hapus paham dki indentitas omong
## topic8 topic9
## 1 sang pilih
## 2 cebong dukung
## 3 unpad nasdem
## 4 kadrun amp
## 5 sampah presiden
## 6 tukang partai
## 7 ngibul capres
## 8 peluk rakyat
## 9 sabrina tinggal
## 10 carpenter biar
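Besides the top terms per topic, the most likely topic of each tweet can also be inspected; the sketch below assumes seededlda's topics() accessor and the theta matrix of document-topic proportions.
# most likely topic for each document
head(topics(tm.lda))
# document-topic proportion matrix (documents x k topics)
dim(tm.lda$theta)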
Next, the topic model is plotted with ggplot2.
topwords.lda <- setNames(reshape2::melt(tm.lda$phi),c("topic","term","beta")) %>%
group_by(topic) %>%
slice_max(beta, n = 10) %>%
ungroup() %>%
arrange(topic, -beta)
topwords.lda[nchar(as.character(topwords.lda$term))>=3 & topwords.lda$topic!="other",] %>%
mutate(term = reorder_within(term, beta, topic)) %>%
ggplot(aes(beta, term, fill = factor(topic))) +
geom_col(show.legend = F) +
facet_wrap(~ topic, scales = "free") +
scale_y_reordered()
A wordcloud is a display of the words that occur most frequently in the data or corpora.
# wordcloud
word_frequency <- dfm %>%
textstat_frequency(n = 100) %>%
as.data.frame()
wordcloud2(word_frequency, size = 1, minRotation = -pi/6, maxRotation = -pi/6,
rotateRatio = 0)
Before drawing the text network, I build a feature co-occurrence matrix (FCM) from the trimmed DFM and keep only its 100 most frequent features.
fcm.tweets <- fcm(dfm.trimmed)
feat <- fcm.tweets %>%
topfeatures(100) %>%
names()
fcm_selected <- fcm_select(fcm.tweets, pattern = feat, selection = "keep")
dim(fcm_selected)
## [1] 100 100
topfeatures(fcm_selected)
## indonesia adha mandiri show idul dream
## 2026 1332 1301 1269 1248 1213
## putrasiregar laura cinta pogba
## 1051 1021 958 947
A text network is a plot that shows the relationships between words within each unit of analysis or document.
size <- log(colSums(dfm_select(dfm, feat, selection = "keep")))
set.seed(144)
textplot_network(fcm_selected, min_freq = 0.5, vertex_size = size / max(size) * 3)
## Warning: ggrepel: 39 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
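The warning comes from ggrepel. If more of the nodes should be labelled, one possible workaround is to raise ggrepel's global limit before plotting, assuming textplot_network leaves ggrepel's default max.overlaps setting in place.
options(ggrepel.max.overlaps = Inf)  # allow ggrepel to label overlapping points
textplot_network(fcm_selected, min_freq = 0.5, vertex_size = size / max(size) * 3)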
Benoit, K., Watanabe, K., Wang, H., Lua, J. W., & Kuha, J. (2021). Package “quanteda.textstats.” Research Bulletin, 27(2), 37–54.
Benoit, K., Watanabe, K., Wang, H., Nulty, P., Obeng, A., Müller, S., & Matsuo, A. (2018). quanteda: An R package for the quantitative analysis of textual data. Journal of Open Source Software, 3(30), 774. https://doi.org/10.21105/joss.00774
Kearney, M. (2019). rtweet: Collecting and analyzing Twitter data. Journal of Open Source Software, 4(42), 1829. https://doi.org/10.21105/joss.01829
Lang, D., & Chien, G. (2018). Package “wordcloud2.”
Watanabe, K., & Xuan-Hieu, P. (2022). Package “seededlda.”
Wickham, H., Averick, M., Bryan, J., Chang, W., McGowan, L., François, R., Grolemund, G., Hayes, A., Henry, L., Hester, J., Kuhn, M., Pedersen, T., Miller, E., Bache, S., Müller, K., Ooms, J., Robinson, D., Seidel, D., Spinu, V., … Yutani, H. (2019). Welcome to the Tidyverse. Journal of Open Source Software, 4(43), 1686. https://doi.org/10.21105/joss.01686