The first step is to load the required libraries.
library(rtweet)
library(quanteda)
library(katadasaR)
library(tidyverse)
library(seededlda)
library(tidytext)
library(quanteda.textstats)
library(quanteda.textplots)
library(wordcloud2)
The next step is to collect the tweets; I use the keyword “#BapakPolitikIdentitas”.
df <- search_tweets("#BapakPolitikIdentitas", n = 10000, lang = "id", include_rts = T)
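Note that search_tweets() only works with an authorized Twitter token. If no token is cached yet, one can be created beforehand; the sketch below assumes rtweet's create_token() interface, and the credential values are placeholders that must be replaced with the keys from your own Twitter developer app.
# hypothetical credentials -- replace with your own app keys
token <- create_token(
  app = "my_twitter_app",
  consumer_key = "CONSUMER_KEY",
  consumer_secret = "CONSUMER_SECRET",
  access_token = "ACCESS_TOKEN",
  access_secret = "ACCESS_SECRET"
)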
Duplicate tweets are then removed.
df.tweets <- df %>%
select(full_text) %>%
na.exclude() %>%
distinct()
rmarkdown::paged_table(data.frame(head(df.tweets)))
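As a quick sanity check, the number of tweets before and after de-duplication can be compared (the counts depend on whatever the search returned).
nrow(df)        # rows returned by search_tweets
nrow(df.tweets) # unique, non-missing tweet texts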
I convert the data frame into a corpus to make the cleaning process easier.
corpus.tweets <- df.tweets %>%
corpus(text_field = "full_text")
head(corpus.tweets)
## Corpus consisting of 6 documents.
## text1 :
## "RT @MahendraXY: Satu geng nih <U+0001F60F> #BapakPolitikIdentitas https..."
##
## text2 :
## "@YRadianto gaspoool!!jangan kasi kendor ...jejak digital #Ba..."
##
## text3 :
## "Hanya orang bodoh yg diam saja dikibuli #BapakPolitikIdentit..."
##
## text4 :
## "Jejak Digital #BapakPolitikIdentitas https://t.co/eOVkkN7mm..."
##
## text5 :
## "@Mdy_Asmara1701 Secara gak langsung elu mengakui kalo junjun..."
##
## text6 :
## "Sah, Ahok Ditetapkan Netizen Sebagai Bapak Politik Identitas..."
Since some punctuation and other noise still remain, I will clean them up with the tokens_remove function in a later chunk; first, the corpus is tokenized while removing symbols, URLs, and numbers.
tokens.tweets <- corpus.tweets %>%
tokens(remove_symbols = T,
remove_url = T,
remove_numbers = T)
head(tokens.tweets)
## Tokens consisting of 6 documents.
## text1 :
## [1] "RT" "@MahendraXY" ":"
## [4] "Satu" "geng" "nih"
## [7] "#BapakPolitikIdentitas"
##
## text2 :
## [1] "@YRadianto" "gaspoool" "!" "!" "jangan"
## [6] "kasi" "kendor" "." "." "."
## [11] "jejak" "digital"
## [ ... and 5 more ]
##
## text3 :
## [1] "Hanya" "orang" "bodoh"
## [4] "yg" "diam" "saja"
## [7] "dikibuli" "#BapakPolitikIdentitas" "berulang"
## [10] "kali" "."
##
## text4 :
## [1] "Jejak" "Digital" "#BapakPolitikIdentitas"
##
## text5 :
## [1] "@Mdy_Asmara1701" "Secara" "gak"
## [4] "langsung" "elu" "mengakui"
## [7] "kalo" "junjungan" "lu"
## [10] "#BapakPolitikIdentitas" "." "Intinya"
## [ ... and 6 more ]
##
## text6 :
## [1] "Sah" "," "Ahok"
## [4] "Ditetapkan" "Netizen" "Sebagai"
## [7] "Bapak" "Politik" "Identitas"
## [10] "," "Tagar" "#BapakPolitikIdentitas"
## [ ... and 9 more ]
I use regular expressions to clean the text data: several patterns remove emoticons, invoice-number strings, URLs, carriage returns, mentions, numbers, and other leftovers.
tokens.tweets.clean <- tokens.tweets %>%
tokens_tolower() %>%
tokens_remove("[:punct:]", valuetype = "regex") %>%
tokens_remove("inv/[0-9]+/+[xvi]+/[xvi]+/[0-9]+", valuetype = "regex") %>%
tokens_remove("[^\x01-\x7F]", valuetype = "regex") %>%
tokens_remove("[0-9]+", valuetype = "regex") %>%
tokens_remove("[:graph:]", valuetype = "regex") %>%
tokens_remove("http[^[:space:]]*", valuetype = "regex") %>%
tokens_remove("[\r\n]", valuetype = "regex") %>%
tokens_remove("@\\w+", valuetype = "regex")
tokens.tweets.clean
## Tokens consisting of 2,583 documents.
## text1 :
## [1] "rt" "satu" "geng" "nih"
##
## text2 :
## [1] "gaspoool" "jangan" "kasi" "kendor" "jejak"
## [6] "digital" "bertebaran"
##
## text3 :
## [1] "hanya" "orang" "bodoh" "yg" "diam" "saja" "dikibuli"
## [8] "berulang" "kali"
##
## text4 :
## [1] "jejak" "digital"
##
## text5 :
## [1] "secara" "gak" "langsung" "elu" "mengakui" "kalo"
## [7] "junjungan" "lu" "intinya" "mah" "gak" "mau"
## [ ... and 2 more ]
##
## text6 :
## [1] "sah" "ahok" "ditetapkan" "netizen" "sebagai"
## [6] "bapak" "politik" "identitas" "tagar" "menggema"
## [11] "di" "twitter"
## [ ... and 5 more ]
##
## [ reached max_ndoc ... 2,577 more documents ]
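To make the effect of these patterns concrete, here is a quick illustration on a single hypothetical tweet string using stringr (loaded with the tidyverse); only the mention and URL patterns are shown.
# hypothetical example string, not taken from the dataset
sample_tweet <- "RT @MahendraXY: Satu geng nih #BapakPolitikIdentitas https://t.co/abc123"
str_remove_all(sample_tweet, "@\\w+")              # drops the mention
str_remove_all(sample_tweet, "http[^[:space:]]*")  # drops the URL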
Because everyday Indonesian is written with informal (colloquial) spellings, every informal spelling will be normalized to its formal form.
spelling.corrector <- read.csv(file = "colloquial-indonesian-lexicon.csv", header = T, stringsAsFactors = F)
slang <- spelling.corrector$slang
formal <- spelling.corrector$formal
rmarkdown::paged_table(head(data.frame(spelling.corrector)))
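To make the mapping concrete, here is a toy illustration of how the slang column is swapped for the formal column with tokens_replace(); the rows below are made up for illustration (the mappings match the output shown later), while the real lexicon comes from the CSV read above.
# toy lexicon rows for illustration only
toy.lexicon <- data.frame(slang  = c("yg", "gak", "kasi"),
                          formal = c("yang", "enggak", "kasih"))
toy.tokens <- tokens("gak usah kasi kendor yg penting")
tokens_replace(toy.tokens, pattern = toy.lexicon$slang, replacement = toy.lexicon$formal)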
The spelling formalization process begins.
tokens.tweets.clean.words <- tokens.tweets.clean %>%
tokens_replace(pattern = slang, replacement = formal)
tokens.tweets.clean.words
## Tokens consisting of 2,583 documents.
## text1 :
## [1] "rt" "satu" "geng" "nih"
##
## text2 :
## [1] "gaspoool" "jangan" "kasih" "kendor" "jejak"
## [6] "digital" "bertebaran"
##
## text3 :
## [1] "hanya" "orang" "bodoh" "yang" "diam" "saja" "dikibuli"
## [8] "berulang" "kali"
##
## text4 :
## [1] "jejak" "digital"
##
## text5 :
## [1] "secara" "enggak" "langsung" "lu" "mengakui" "kalo"
## [7] "junjungan" "lu" "intinya" "mah" "enggak" "mau"
## [ ... and 2 more ]
##
## text6 :
## [1] "sah" "ahok" "ditetapkan" "netizen" "sebagai"
## [6] "bapak" "politik" "identitas" "tagar" "menggema"
## [11] "di" "twitter"
## [ ... and 5 more ]
##
## [ reached max_ndoc ... 2,577 more documents ]
Stemming reduces affixed words to their base forms (kata dasar).
stemming <- function(x){
paste(lapply(x,katadasar), collapse = " ")}
tokens.tweets.clean.words <- tokens(lapply(tokens(tokens.tweets.clean.words), stemming))
tokens.tweets.clean.words <- tokens(tokenize_fasterword((tokens.tweets.clean.words)))
tokens.tweets.clean.words
## Tokens consisting of 2,583 documents.
## text1 :
## [1] "rt" "satu" "geng" "nih"
##
## text2 :
## [1] "gaspoool" "jangan" "kasih" "kendor" "jejak" "digital" "tebar"
##
## text3 :
## [1] "hanya" "orang" "bodoh" "yang" "diam" "saja" "kibul" "ulang" "kali"
##
## text4 :
## [1] "jejak" "digital"
##
## text5 :
## [1] "cara" "enggak" "langsung" "lu" "aku" "kalo"
## [7] "junjung" "lu" "inti" "mah" "enggak" "mau"
## [ ... and 2 more ]
##
## text6 :
## [1] "sah" "ahok" "tetap" "netizen" "bagai" "bapak"
## [7] "politik" "identitas" "tagar" "gema" "di" "twitter"
## [ ... and 5 more ]
##
## [ reached max_ndoc ... 2,577 more documents ]
Stopwords are frequently recurring words, as well as words that carry no meaning for the analysis.
stopwords.id <- read.table("stopwords.txt") %>%
as.character()
head(stopwords.id)
## [1] "ada" "adalah" "adanya" "adapun" "agak" "agaknya"
The stopword removal process begins.
tokens.tweets.clean.words <- tokens.tweets.clean.words %>%
tokens_remove(stopwords.id)
tokens.tweets.clean.words
## Tokens consisting of 2,583 documents.
## text1 :
## [1] "geng"
##
## text2 :
## [1] "gaspoool" "kasih" "kendor" "jejak" "digital" "tebar"
##
## text3 :
## [1] "orang" "bodoh" "diam" "kibul" "ulang" "kali"
##
## text4 :
## [1] "jejak" "digital"
##
## text5 :
## [1] "langsung" "junjung" "inti" "mah" "kotor"
##
## text6 :
## [1] "sah" "ahok" "netizen" "politik" "identitas" "tagar"
## [7] "gema" "twitter" "hoax"
##
## [ reached max_ndoc ... 2,577 more documents ]
A document-feature matrix (DFM) is a matrix with features (words) as its columns and documents as its rows.
dfm <- tokens.tweets.clean.words %>%
dfm()
dfm
## Document-feature matrix of: 2,583 documents, 3,484 features (99.80% sparse) and 0 docvars.
## features
## docs geng gaspoool kasih kendor jejak digital tebar orang bodoh diam
## text1 1 0 0 0 0 0 0 0 0 0
## text2 0 1 1 1 1 1 1 0 0 0
## text3 0 0 0 0 0 0 0 1 1 1
## text4 0 0 0 0 1 1 0 0 0 0
## text5 0 0 0 0 0 0 0 0 0 0
## text6 0 0 0 0 0 0 0 0 0 0
## [ reached max_ndoc ... 2,577 more documents, reached max_nfeat ... 3,474 more features ]
The DFM is trimmed so that no word dominates the wordcloud and topic modelling too heavily.
dfm.trimmed <- dfm %>%
dfm_trim(min_termfreq = 0.8, termfreq_type = "quantile",
max_docfreq = 0.1, docfreq_type = "prop")
head(dfm.trimmed)
## Document-feature matrix of: 6 documents, 745 features (99.49% sparse) and 0 docvars.
## features
## docs kasih jejak digital tebar orang bodoh diam ulang kali langsung
## text1 0 0 0 0 0 0 0 0 0 0
## text2 1 1 1 1 0 0 0 0 0 0
## text3 0 0 0 0 1 1 1 1 1 0
## text4 0 1 1 0 0 0 0 0 0 0
## text5 0 0 0 0 0 0 0 0 0 1
## text6 0 0 0 0 0 0 0 0 0 0
## [ reached max_nfeat ... 735 more features ]
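Before fitting the topic model, it can help to glance at the most frequent features that remain after trimming; a quick check with quanteda's topfeatures() (the result will vary with the collected tweets).
topfeatures(dfm.trimmed, 20)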
The topic modelling process begins: an LDA model is fitted to the trimmed DFM with k = 9 topics.
tm.lda <- textmodel_lda(dfm.trimmed, k = 9)
data.frame(terms(tm.lda, 10))
## topic1 topic2 topic3 topic4 topic5 topic6 topic7
## 1 jakarta the jejak identitas warga indonesia orang
## 2 selamat hasil digital dki nama baswedan wan
## 3 holywings putrasiregar lupa agama jalan anis ahok
## 4 mandiri selamat dukung pilkada ganti identitas lihat
## 5 cinta jakarta tolak gubernur kerja ahok nyata
## 6 papua jokowi cuci nkri bikin tagar gelar
## 7 maria adha identitas menang susah netizen karya
## 8 widi idul buzzer bangsa jakarta trending makna
## 9 alphard litbang lawan ayat bahagia alam pakai
## 10 chris alphard hapus paham dki indentitas omong
## topic8 topic9
## 1 sang pilih
## 2 cebong dukung
## 3 unpad nasdem
## 4 kadrun amp
## 5 sampah presiden
## 6 tukang partai
## 7 ngibul capres
## 8 peluk rakyat
## 9 sabrina tinggal
## 10 carpenter biar
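Besides the top terms per topic, the most likely topic of each tweet can also be inspected; the sketch below assumes seededlda's topics() accessor and the theta matrix of document-topic proportions.
# most likely topic for each document
head(topics(tm.lda))
# document-topic proportion matrix (documents x k topics)
dim(tm.lda$theta)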
Next, the topic model is plotted with ggplot2.
topwords.lda <- setNames(reshape2::melt(tm.lda$phi),c("topic","term","beta")) %>%
group_by(topic) %>%
slice_max(beta, n = 10) %>%
ungroup() %>%
arrange(topic, -beta)
topwords.lda[nchar(as.character(topwords.lda$term))>=3 & topwords.lda$topic!="other",] %>%
mutate(term = reorder_within(term, beta, topic)) %>%
ggplot(aes(beta, term, fill = factor(topic))) +
geom_col(show.legend = F) +
facet_wrap(~ topic, scales = "free") +
scale_y_reordered()
A wordcloud is a display of the words that occur most frequently in the data or corpora.
# wordcloud
word_frequency <- dfm %>%
textstat_frequency(n = 100) %>%
as.data.frame()
wordcloud2(word_frequency, size = 1, minRotation = -pi/6, maxRotation = -pi/6,
rotateRatio = 0)
Before drawing the text network, I build a feature co-occurrence matrix (FCM) from the trimmed DFM and keep only its 100 most frequent features.
fcm.tweets <- fcm(dfm.trimmed)
feat <- fcm.tweets %>%
topfeatures(100) %>%
names()
fcm_selected <- fcm_select(fcm.tweets, pattern = feat, selection = "keep")
dim(fcm_selected)
## [1] 100 100
topfeatures(fcm_selected)
## indonesia adha mandiri show idul dream
## 2026 1332 1301 1269 1248 1213
## putrasiregar laura cinta pogba
## 1051 1021 958 947
A text network is a plot that shows the relationships between words within each unit of analysis or document.
size <- log(colSums(dfm_select(dfm, feat, selection = "keep")))
set.seed(144)
textplot_network(fcm_selected, min_freq = 0.5, vertex_size = size / max(size) * 3)
## Warning: ggrepel: 39 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
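The warning comes from ggrepel. If more of the nodes should be labelled, one possible workaround is to raise ggrepel's global limit before plotting, assuming textplot_network leaves ggrepel's default max.overlaps setting in place.
options(ggrepel.max.overlaps = Inf)  # allow ggrepel to label overlapping points
textplot_network(fcm_selected, min_freq = 0.5, vertex_size = size / max(size) * 3)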
Benoit, K., Watanabe, K., Wang, H., Lua, J. W., & Kuha, J. (2021). Package “quanteda.textstats.” Research Bulletin, 27(2), 37–54.
Benoit, K., Watanabe, K., Wang, H., Nulty, P., Obeng, A., Müller, S., & Matsuo, A. (2018). quanteda: An R package for the quantitative analysis of textual data. Journal of Open Source Software, 3(30), 774. https://doi.org/10.21105/joss.00774
Kearney, M. (2019). rtweet: Collecting and analyzing Twitter data. Journal of Open Source Software, 4(42), 1829. https://doi.org/10.21105/joss.01829
Lang, D., & Chien, G. (2018). Package “wordcloud2.”
Watanabe, K., & Xuan-Hieu, P. (2022). Package “seededlda.”
Wickham, H., Averick, M., Bryan, J., Chang, W., McGowan, L., François, R., Grolemund, G., Hayes, A., Henry, L., Hester, J., Kuhn, M., Pedersen, T., Miller, E., Bache, S., Müller, K., Ooms, J., Robinson, D., Seidel, D., Spinu, V., … Yutani, H. (2019). Welcome to the Tidyverse. Journal of Open Source Software, 4(43), 1686. https://doi.org/10.21105/joss.01686