The first step is to load the required libraries.
library(rtweet)
library(quanteda)
library(katadasaR)
library(tidyverse)
library(seededlda)
library(tidytext)
library(quanteda.textstats)
library(quanteda.textplots)
library(wordcloud2)
The next step is to collect the tweet data; I search for Indonesian-language tweets with the keyword “holywings” (note that search_tweets() requires an authenticated rtweet token).
df <- search_tweets("holywings", n = 10000, lang = "id", include_rts = T)
Duplicate tweets and missing values are then removed.
df.tweets <- df %>%
select(full_text) %>%
na.exclude() %>%
distinct()
rmarkdown::paged_table(data.frame(head(df.tweets)))
I convert the data frame into a quanteda corpus to make the cleaning steps easier.
corpus.tweets <- df.tweets %>%
corpus(text_field = "full_text")
head(corpus.tweets)
## Corpus consisting of 6 documents.
## text1 :
## "Holywings yang dikenal sebagai usaha restoran, kelab malam, ..."
##
## text2 :
## "Holywings harus meminta maaf kepada seluruh umat Islam dalam..."
##
## text3 :
## "GP Ansor Akan Polisikan Holywings Buntut Promo Alkohol untuk..."
##
## text4 :
## "RT @Lelaki_5uny1: Breaking News....!! Konpres Kapolres Metro..."
##
## text5 :
## "RT @DPP_LIP: PERNYATAAN SIKAP BERSAMA <U+0001D405><U+0001D40F><U+0001D408>, <U+0001D406><U+0001D40D><U+0001D40F><U+0001D405> <U+0001D414><U+0001D40B><U+0001D400><U+0001D40C><U+0001D400> DAN <U+0001D40F><U+0001D400>..."
##
## text6 :
## "Polisi Tetapkan 6 Tersangka Staf Holywings, Kena Pasal Penis..."
Next, the corpus is tokenized, removing symbols, URLs, and numbers during tokenization. As the output below shows, some punctuation still remains, so I will clean it with tokens_remove() in a later chunk.
tokens.tweets <- corpus.tweets %>%
tokens(remove_symbols = T,
remove_url = T,
remove_numbers = T)
head(tokens.tweets)
## Tokens consisting of 6 documents.
## text1 :
## [1] "Holywings" "yang" "dikenal" "sebagai" "usaha" "restoran"
## [7] "," "kelab" "malam" "," "dan" "bar"
## [ ... and 25 more ]
##
## text2 :
## [1] "Holywings" "harus" "meminta" "maaf" "kepada" "seluruh"
## [7] "umat" "Islam" "dalam" "waktu" "x" "jam"
## [ ... and 1 more ]
##
## text3 :
## [1] "GP" "Ansor" "Akan" "Polisikan" "Holywings" "Buntut"
## [7] "Promo" "Alkohol" "untuk" "Muhammad"
##
## text4 :
## [1] "RT" "@Lelaki_5uny1" ":" "Breaking"
## [5] "News" "." "." "."
## [9] "." "!" "!" "Konpres"
## [ ... and 14 more ]
##
## text5 :
## [1] "RT" "@DPP_LIP" ":" "PERNYATAAN" "SIKAP"
## [6] "BERSAMA" "<U+0001D405><U+0001D40F><U+0001D408>" "," "<U+0001D406><U+0001D40D><U+0001D40F><U+0001D405>" "<U+0001D414><U+0001D40B><U+0001D400><U+0001D40C><U+0001D400>"
## [11] "DAN" "<U+0001D40F><U+0001D400>"
## [ ... and 6 more ]
##
## text6 :
## [1] "Polisi" "Tetapkan" "Tersangka" "Staf" "Holywings"
## [6] "," "Kena" "Pasal" "Penistaan" "Agama"
## [11] "lewat" "@KanalnewsC"
## [ ... and 1 more ]
I use regular expressions to clean the text data: emoticons and other non-ASCII characters, invoice-style codes, URLs, carriage returns, mentions, numbers, and other leftovers are removed.
tokens.tweets.clean <- tokens.tweets %>%
tokens_tolower() %>%
tokens_remove("[:punct:]", valuetype = "regex") %>%
tokens_remove("inv/[0-9]+/+[xvi]+/[xvi]+/[0-9]+", valuetype = "regex") %>%
tokens_remove("[^\x01-\x7F]", valuetype = "regex") %>%
tokens_remove("[0-9]+", valuetype = "regex") %>%
tokens_remove("[:graph:]", valuetype = "regex") %>%
tokens_remove("http[^[:space:]]*", valuetype = "regex") %>%
tokens_remove("[\r\n]", valuetype = "regex") %>%
tokens_remove("@\\w+", valuetype = "regex")
tokens.tweets.clean
## Tokens consisting of 901 documents.
## text1 :
## [1] "holywings" "yang" "dikenal" "sebagai" "usaha" "restoran"
## [7] "kelab" "malam" "dan" "bar" "membuat" "promosi"
## [ ... and 19 more ]
##
## text2 :
## [1] "holywings" "harus" "meminta" "maaf" "kepada" "seluruh"
## [7] "umat" "islam" "dalam" "waktu" "x" "jam"
##
## text3 :
## [1] "gp" "ansor" "akan" "polisikan" "holywings" "buntut"
## [7] "promo" "alkohol" "untuk" "muhammad"
##
## text4 :
## [1] "rt" "breaking" "news" "konpres" "kapolres" "metro"
## [7] "jakarta" "selatan" "soal" "promo" "alkohol" "menista"
## [ ... and 5 more ]
##
## text5 :
## [1] "rt" "pernyataan" "sikap" "bersama" "dan"
## [6] "tentang" "penodaan" "agama" "oleh" "pihak"
## [11] "holywings"
##
## text6 :
## [1] "polisi" "tetapkan" "tersangka" "staf" "holywings" "kena"
## [7] "pasal" "penistaan" "agama" "lewat"
##
## [ reached max_ndoc ... 895 more documents ]
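As a quick sanity check of two of the patterns used above, here is a toy sketch with stringi, the string engine behind quanteda's regex matching (the URL string is only illustrative):
library(stringi)
# a mention token matches the "@\\w+" pattern, so tokens like this are removed
stri_detect_regex("@Lelaki_5uny1", "@\\w+")                    # TRUE
# a leftover URL fragment matches the "http[^[:space:]]*" pattern
stri_detect_regex("https://t.co/xxxx", "http[^[:space:]]*")    # TRUE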
Because everyday Indonesian is often written with informal (colloquial) spellings, each informal spelling is normalized to its formal form using a colloquial Indonesian lexicon.
spelling.corrector <- read.csv(file = "colloquial-indonesian-lexicon.csv", header = T, stringsAsFactors = F)
slang <- spelling.corrector$slang
formal <- spelling.corrector$formal
rmarkdown::paged_table(head(data.frame(spelling.corrector)))
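To illustrate how tokens_replace() applies such a slang-to-formal lexicon, here is a toy sketch with a few illustrative pairs (not taken from the actual lexicon file):
toy.tokens <- tokens("gue gak tau")
tokens_replace(toy.tokens, pattern = c("gue", "gak", "tau"),
replacement = c("saya", "tidak", "tahu"))
# the informal "gue gak tau" becomes the formal "saya tidak tahu"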
The spelling normalization is now applied to the tokens with tokens_replace().
tokens.tweets.clean.words <- tokens.tweets.clean %>%
tokens_replace(pattern = slang, replacement = formal)
tokens.tweets.clean.words
## Tokens consisting of 901 documents.
## text1 :
## [1] "holywings" "yang" "dikenal" "sebagai" "usaha" "restoran"
## [7] "kelab" "malam" "dan" "bar" "membuat" "promosi"
## [ ... and 19 more ]
##
## text2 :
## [1] "holywings" "harus" "meminta" "maaf" "kepada" "seluruh"
## [7] "umat" "islam" "dalam" "waktu" "kali" "jam"
##
## text3 :
## [1] "gp" "ansor" "akan" "polisikan" "holywings" "buntut"
## [7] "promo" "alkohol" "untuk" "muhammad"
##
## text4 :
## [1] "rt" "breaking" "news" "konpres" "kapolres" "metro"
## [7] "jakarta" "selatan" "soal" "promo" "alkohol" "menista"
## [ ... and 5 more ]
##
## text5 :
## [1] "rt" "pernyataan" "sikap" "bersama" "dan"
## [6] "tentang" "penodaan" "agama" "oleh" "pihak"
## [11] "holywings"
##
## text6 :
## [1] "polisi" "tetapkan" "tersangka" "staf" "holywings" "kena"
## [7] "pasal" "penistaan" "agama" "lewat"
##
## [ reached max_ndoc ... 895 more documents ]
Stemming reduces affixed words to their base (root) forms; here I use the katadasaR stemmer.
# helper: stem each token with katadasaR and collapse the document back into one string
stemming <- function(x){
paste(lapply(x,katadasar), collapse = " ")}
# apply the helper to every document, then re-tokenize the stemmed strings
tokens.tweets.clean.words <- tokens(lapply(tokens(tokens.tweets.clean.words), stemming))
tokens.tweets.clean.words <- tokens(tokenize_fasterword((tokens.tweets.clean.words)))
tokens.tweets.clean.words
## Tokens consisting of 901 documents.
## text1 :
## [1] "holywings" "yang" "kenal" "bagai" "usaha" "restoran"
## [7] "kelab" "malam" "dan" "bar" "buat" "promosi"
## [ ... and 19 more ]
##
## text2 :
## [1] "holywings" "harus" "minta" "maaf" "kepada" "seluruh"
## [7] "umat" "islam" "dalam" "waktu" "kali" "jam"
##
## text3 :
## [1] "gp" "ansor" "akan" "polis" "holywings" "buntut"
## [7] "promo" "alkohol" "untuk" "muhammad"
##
## text4 :
## [1] "rt" "breaking" "news" "konpres" "kapolres" "metro"
## [7] "jakarta" "selatan" "soal" "promo" "alkohol" "nista"
## [ ... and 5 more ]
##
## text5 :
## [1] "rt" "nyata" "sikap" "sama" "dan" "tentang"
## [7] "noda" "agama" "oleh" "pihak" "holywings"
##
## text6 :
## [1] "polisi" "tetap" "sangka" "staf" "holywings" "kena"
## [7] "pasal" "nista" "agama" "lewat"
##
## [ reached max_ndoc ... 895 more documents ]
Stopwords are high-frequency words that carry little meaning for the analysis; I remove them using an Indonesian stopword list.
stopwords.id <- read.table("stopwords.txt") %>%
as.character()
head(stopwords.id)
## [1] "ada" "adalah" "adanya" "adapun" "agak" "agaknya"
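Before applying the list to the tweets, here is a toy sketch of what tokens_remove() does with such words (the example sentence is made up):
toy.tokens <- tokens("holywings harus minta maaf kepada umat")
tokens_remove(toy.tokens, c("harus", "kepada"))
# only the content words remain: "holywings" "minta" "maaf" "umat"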
The stopword removal is now applied to the tokens.
tokens.tweets.clean.words <- tokens.tweets.clean.words %>%
tokens_remove(stopwords.id)
tokens.tweets.clean.words
## Tokens consisting of 901 documents.
## text1 :
## [1] "holywings" "kenal" "usaha" "restoran" "kelab" "malam"
## [7] "bar" "promosi" "minum" "alkohol" "gratis" "unjung"
## [ ... and 9 more ]
##
## text2 :
## [1] "holywings" "maaf" "umat" "islam" "kali" "jam"
##
## text3 :
## [1] "gp" "ansor" "polis" "holywings" "buntut" "promo"
## [7] "alkohol" "muhammad"
##
## text4 :
## [1] "breaking" "news" "konpres" "kapolres" "metro" "jakarta"
## [7] "selatan" "promo" "alkohol" "nista" "agama" "holywings"
## [ ... and 2 more ]
##
## text5 :
## [1] "nyata" "sikap" "noda" "agama" "holywings"
##
## text6 :
## [1] "polisi" "sangka" "staf" "holywings" "kena" "pasal"
## [7] "nista" "agama"
##
## [ reached max_ndoc ... 895 more documents ]
A document-feature matrix (DFM) is a matrix with documents as rows and features (words) as columns.
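To make the structure concrete before building the real matrix below, here is a toy sketch on two made-up sentences:
toy.texts <- c(doc1 = "promo alkohol gratis", doc2 = "promo minuman gratis gratis")
dfm(tokens(toy.texts))
# doc1 and doc2 are the rows; promo, alkohol, gratis, and minuman are the feature columns,
# and each cell counts how often the word occurs in that document ("gratis" is 2 in doc2)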
dfm <- tokens.tweets.clean.words %>%
dfm()
dfm
## Document-feature matrix of: 901 documents, 2,137 features (99.69% sparse) and 0 docvars.
## features
## docs holywings kenal usaha restoran kelab malam bar promosi minum alkohol
## text1 1 1 1 1 1 1 1 2 1 1
## text2 1 0 0 0 0 0 0 0 0 0
## text3 1 0 0 0 0 0 0 0 0 1
## text4 1 0 0 0 0 0 0 0 0 1
## text5 1 0 0 0 0 0 0 0 0 0
## text6 1 0 0 0 0 0 0 0 0 0
## [ reached max_ndoc ... 895 more documents, reached max_nfeat ... 2,127 more features ]
The DFM is trimmed so that overly dominant words do not swamp the subsequent analyses: only terms at or above the 80th percentile of term frequency are kept, and terms that occur in more than 10% of the documents (such as the query word “holywings” itself) are dropped.
dfm.trimmed <- dfm %>%
dfm_trim(min_termfreq = 0.8, termfreq_type = "quantile",
max_docfreq = 0.1, docfreq_type = "prop")
head(dfm.trimmed)
## Document-feature matrix of: 6 documents, 430 features (98.41% sparse) and 0 docvars.
## features
## docs usaha restoran malam bar promosi minum alkohol gratis nama muhammad
## text1 1 1 1 1 2 1 1 1 1 1
## text2 0 0 0 0 0 0 0 0 0 0
## text3 0 0 0 0 0 0 1 0 0 1
## text4 0 0 0 0 0 0 1 0 0 0
## text5 0 0 0 0 0 0 0 0 0 0
## text6 0 0 0 0 0 0 0 0 0 0
## [ reached max_nfeat ... 420 more features ]
The topic modelling now begins: an LDA model with k = 9 topics is fitted on the trimmed DFM with seededlda, and the top 10 terms of each topic are inspected.
tm.lda <- textmodel_lda(dfm.trimmed, k = 9)
data.frame(terms(tm.lda, 10))
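Besides the top terms, the fitted object also stores per-document topic proportions. A hedged sketch for inspecting them, assuming the seededlda accessors ($theta and topics()) behave as documented:
# theta stores the document-topic proportions (one row per tweet, one column per topic)
head(tm.lda$theta)
# topics() returns the single most likely topic label for each document
head(topics(tm.lda))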
Next, the topic model is plotted with ggplot2, using the per-topic term probabilities (phi) and tidytext's reorder_within() to order the terms within each facet.
# extract the per-topic term probabilities (phi) and keep the 10 highest-probability terms per topic
topwords.lda <- setNames(reshape2::melt(tm.lda$phi),c("topic","term","beta")) %>%
group_by(topic) %>%
slice_max(beta, n = 10) %>%
ungroup() %>%
arrange(topic, -beta)
# keep terms of at least 3 characters (and drop any residual "other" topic), then plot one facet per topic
topwords.lda[nchar(as.character(topwords.lda$term))>=3 & topwords.lda$topic!="other",] %>%
mutate(term = reorder_within(term, beta, topic)) %>%
ggplot(aes(beta, term, fill = factor(topic))) +
geom_col(show.legend = F) +
facet_wrap(~ topic, scales = "free") +
scale_y_reordered()
A wordcloud shows the words that occur most frequently in the data or corpus; here the 100 most frequent features of the DFM are plotted with wordcloud2.
# wordcloud
word_frequency <- dfm %>%
textstat_frequency(n = 100) %>%
as.data.frame()
wordcloud2(word_frequency, size = 7, minRotation = -pi/6, maxRotation = -pi/6,
rotateRatio = 0)
Before plotting the text network, a feature co-occurrence matrix (FCM) is built from the trimmed DFM and restricted to its 100 most frequent features.
fcm.tweets <- fcm(dfm.trimmed)
feat <- fcm.tweets %>%
topfeatures(100) %>%
names()
fcm_selected <- fcm_select(fcm.tweets, pattern = feat, selection = "keep")
dim(fcm_selected)
## [1] 100 100
topfeatures(fcm_selected)
## people holy berlaku syarat giveaway penuh ecek dasar
## 199 192 173 160 157 148 134 128
## info event
## 119 106
A text network is a plot that shows how words co-occur across the documents (the units of analysis).
# scale each node by the log frequency of the corresponding feature in the DFM
size <- log(colSums(dfm_select(dfm, feat, selection = "keep")))
set.seed(144)
textplot_network(fcm_selected, min_freq = 0.3, vertex_size = size / max(size) * 3)
## Warning: ggrepel: 2 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
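The ggrepel warning only means that a couple of crowded node labels were dropped. A hedged aside: ggrepel reads a global option, so raising the overlap limit before re-plotting usually brings the missing labels back.
options(ggrepel.max.overlaps = Inf)   # allow ggrepel to label all nodes, even crowded ones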
Benoit, K., Watanabe, K., Wang, H., Lua, J. W., & Kuha, J. (2021). Package “quanteda.textstats.” Research Bulletin, 27(2), 37–54.
Benoit, K., Watanabe, K., Wang, H., Nulty, P., Obeng, A., Müller, S., & Matsuo, A. (2018). quanteda: An R package for the quantitative analysis of textual data. Journal of Open Source Software, 3(30), 774. https://doi.org/10.21105/joss.00774
Kearney, M. (2019). rtweet: Collecting and analyzing Twitter data. Journal of Open Source Software, 4(42), 1829. https://doi.org/10.21105/joss.01829
Lang, D., Chien, G., & Lang, D. (2018). Package “wordcloud2.”
Watanabe, K., & Xuan-Hieu, P. (2022). Package “seededlda.”
Wickham, H., Averick, M., Bryan, J., Chang, W., McGowan, L., François, R., Grolemund, G., Hayes, A., Henry, L., Hester, J., Kuhn, M., Pedersen, T., Miller, E., Bache, S., Müller, K., Ooms, J., Robinson, D., Seidel, D., Spinu, V., … Yutani, H. (2019). Welcome to the Tidyverse. Journal of Open Source Software, 4(43), 1686. https://doi.org/10.21105/joss.01686