Analisis clustering adalah proses pengelompokan data atau objek ke dalam kelompok atau klaster berdasarkan kesamaan karakteristik tertentu. Tujuan utama dari analisis clustering adalah untuk mengidentifikasi struktur tersembunyi dalam data dan mengelompokkan objek yang serupa ke dalam klaster yang sama. Teknik ini sangat berguna dalam pengelompokan data, pengurangan dimensi, dan pemahaman pola yang mungkin sulit ditemukan secara manual. Tujuan Analisis clustering:
Pemahaman struktur data : Analisis clustering membantu memahami struktur intrinsik dalam data serta mengidentifikasi pola atau kelompok yang mungkin sulit ditemukan secara manual.
Segmentasi data : Dengan mengelompokkan data ke dalam klaster, kita dapat membuat segmentasi yang lebih baik dalam pemahaman karakteristik dan perilaku suatu objek dalam set data.
Tahapan dalam melakukan Analisis clustering
library(wordcloud) #menciptakan visualisasi awan kata(word cloud) dari sebuah teks.
## Loading required package: RColorBrewer
library(tm) #Melakukan penambangan, teks termasuk pembersihan teks, pengindeksan, dan representasi dokumen.
## Loading required package: NLP
library(textclean) #Membersihkan teks dari karakter atau pola tertentu.
library(tidytext) #Memanipulasi dan menganalisis data teks dengan memanfaatkan struktur data tidy.
library(ggplot2) #Membuat grafik dengan menggunakan pendekatan berbasis lapisan(layered) dalam bahasa R.
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
library(parallel) #Mempercepat beberapa operasi.
library(tokenizers) #Membagi teks menjadi token.
library(tau) #Paket analisis teks untuk bahasa R.
library(NLP) #Memproses dan menganalisis teks dalam bahasa R.
library(stringr) #Memanipulasi dan melakukan analisis string di R.
library(devtools) #Memfasilitasi pengembangan paket di R.
## Loading required package: usethis
library(quanteda) #Menganalisis teks kuantitatif.
## Warning in .recacheSubclasses(def@className, def, env): undefined subclass
## "ndiMatrix" of class "replValueSp"; definition not updated
## Warning in .recacheSubclasses(def@className, def, env): undefined subclass
## "pcorMatrix" of class "replValueSp"; definition not updated
## Warning in stringi::stri_info(): Your current locale is not in the list of
## available locales. Some functions may not work properly. Refer to
## stri_locale_list() for more details on known locale specifiers.
## Warning in stringi::stri_info(): Your current locale is not in the list of
## available locales. Some functions may not work properly. Refer to
## stri_locale_list() for more details on known locale specifiers.
## Package version: 3.3.1
## Unicode version: 13.0
## ICU version: 69.1
## Parallel computing: 2 of 2 threads used.
## See https://quanteda.io for tutorials and examples.
##
## Attaching package: 'quanteda'
## The following object is masked from 'package:tm':
##
## stopwords
## The following objects are masked from 'package:NLP':
##
## meta, meta<-
library(kayadata)
library(syuzhet) #Alat analisis sentimen.
library(e1071) #Menyediakan implementasi yang kuat untuk SVM(Support Vector Machiness) dan algoritma dalam statistik.
library(sentimentr) #Paket analisis sentimen di R.
##
## Attaching package: 'sentimentr'
## The following object is masked from 'package:syuzhet':
##
## get_sentences
library(SentimentAnalysis) #Alat analisis sentimen di R.
##
## Attaching package: 'SentimentAnalysis'
## The following object is masked from 'package:base':
##
## write
library(dplyr) #Memanipulasi dan menggabungkan data frame dengan cara yang bersih dan efisien.
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(pacman) #Mengurai kata-kata dalam teks menjadi bentuk dasar (stemming).
pacman::p_load_gh("trinker/textstem")
pacman::p_load(textstem, dplyr)
set.seed(1234)
library(readr)
##
## Attaching package: 'readr'
## The following object is masked from 'package:koRpus':
##
## tokenize
## The following object is masked from 'package:tau':
##
## tokenize
# Read the tweet data set; the file holds a single character column,
# `full_text`.
# NOTE(review): absolute, machine-specific path - confirm it exists before
# running this on another machine.
GOLKAR <- readr::read_csv(
  file = "C:/Users/ASUS/Documents/SEMESTER 5/KOMPSTAT LANJUT/GOLKAR.csv"
)
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## Rows: 180 Columns: 1
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): full_text
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Pull the tweet text out as a character vector and preview the first entries.
# The vector is named `rev`, the same name as a base R function; a call written
# as rev(...) still resolves to the function, not to this vector.
rev <- GOLKAR[["full_text"]]
head(rev)
## [1] "SUDAH DAPAT DI TEBAK ENDING DARI SEMUA INI, LIHAT BEBERAPA TAHUN KEDEPAN KALAU BENERAN TERPILIH.. https://t.co/K9quBe55yj"
## [2] "gw tau politik tuh dinamis, tapi yang ini beneran NGEBINGUNGIN"
## [3] "Bapaknya setuju gak nih?"
## [4] "Politik dinasti selangkah lagi"
## [5] "Politik dinasti is real kah?"
## [6] "Jadi gibran udah ga di partai banteng lagi, pindah ke beringin atau gimana?"
Text Cleaning
Text cleaning adalah proses untuk memperbaiki dan membersihkan data dari kesalahan, ketidakakuratan, dan noise.
# Normalise the raw text step by step; every stage takes and returns the whole
# character vector of documents.
# NOTE(review): the preview printed after this step shows Indonesian words
# being truncated (e.g. "dinamis" -> "dinami"), so the stemmer/lemmatiser
# presumably targets English - confirm this is acceptable for this corpus.
set.seed(123)
rev <- rev %>%
  tolower() %>%                  # fold everything to lower case
  replace_contraction() %>%      # expand contracted forms to full words
  replace_word_elongation() %>%  # collapse artificially stretched words
  strip() %>%                    # drop symbols
  stem_strings() %>%             # stem every word of each document
  lemmatize_strings()            # lemmatise every word of each document
# Word-level demonstration: stem_words() and lemmatize_words() treat every
# element of the vector as one separate word.
sc <- c(
  "driver", "drive", "drove",
  "driven", "drives", "driving"
)
stem_words(sc) # stemming applied word by word
## [1] "driver" "drive" "drove" "driven" "drive" "drive"
lemmatize_words(sc) # lemmatisation applied word by word
## [1] "driver" "drive" "drive" "drive" "drive" "drive"
# Preview the cleaned documents.
head(rev)
## [1] "sudah dapat di tebak end dari semua ini lihat beberapa tahun kedepan kalau beneran terpilih httpstcok qube yj"
## [2] "gw tau politik tuh dinami tapi yang ini beneran ngebingungin"
## [3] "bapaknya setuju gak nih"
## [4] "politik dinasti selangkah lagi"
## [5] "politik dinasti i real kah"
## [6] "jadi gibran udah ga di partai banteng lagi pindah ke beringin atau gimana"
# Reverse the order of the documents.
# NOTE(review): the original comment labelled this step as tokenisation, but
# piping the vector into rev() only calls base::rev(): the last document
# becomes the first and no text is split into tokens here. The behaviour is
# kept as-is because the document numbering downstream depends on it.
rev <- base::rev(rev)
# Remove a hand-picked list of connecting / filler words from every document.
rev <- removeWords(
  rev,
  c(
    "ternyata", "buat", "yakin", "red", "sdh", "ikut", "tahun", "milih",
    "emang", "bye", "ibu", "abi", "gimana", "gitu", "hal", "kasian", "sah",
    "klo", "nih", "atau", "deh", "pen", "sama", "menurut", "sudah", "kemaren",
    "keluarga", "bukan", "tapi", "lagi", "apa", "semua", "kayak", "kalian",
    "saja", "untuk", "lebih", "baru", "belum", "jauh", "bapak", "amin", "jika",
    "capr", "akan", "jurkam", "usung", "dengan", "sendirikan", "beneran",
    "pengalaman", "wkwk", "blunder", "mega", "haru", "banget", "orang",
    "kalau", "masih", "saya", "kamil", "pilih", "kok", "lah", "tapi", "sih",
    "mau", "udah", "punya", "bisa", "anak", "aja", "cawapr", "besar", "kalo",
    "nya", "yang", "you", "ada", "itu", "dan", "dari", "gak", "pak", "dia",
    "lie", "ini", "jadi", "httpstco", "banyak", "bakalan", "banyak", "solo",
    "kan", "pemilik", "putaran", "bener", "selama", "jangan", "kemarin",
    "kali", "juga", "tiba", "sendiri", "siapa", "katanya", "bikin", "bocil",
    "bakal", "pasti", "tinggal", "bagu", "agar", "kan", "bakalan", "banyak",
    "pernah", "ma", "yg", "di", "ga", "gw", "jd", "ke", "mk", "tp"
  )
)
Membuat wordcloud
Wordcloud adalah representasi visual dari data teks. Wordcloud dibuat dengan cara menghitung frekuensi kemunculan setiap kata dalam teks, kemudian menampilkan kata-kata tersebut dengan ukuran yang proporsional dengan frekuensinya. Kata-kata yang sering muncul akan ditampilkan dengan ukuran yang lebih besar, sedangkan kata-kata yang jarang muncul akan ditampilkan dengan ukuran yang lebih kecil.
# Build a term-document matrix from the cleaned text and total each term's
# count across all documents, most frequent first.
tdm <- TermDocumentMatrix(rev)
m <- as.matrix(tdm)
v <- m %>%
  rowSums() %>%
  sort(decreasing = TRUE)
# Collect the terms and their totals in a data frame for plotting.
d <- data.frame(word = names(v), freq = v)
# Draw the word cloud: at most 80 words, not placed in random order, coloured
# with the 8-colour "Dark2" palette.
wordcloud(
  words = d$word,
  freq = d$freq,
  max.words = 80,
  random.order = FALSE,
  colors = brewer.pal(n = 8, name = "Dark2")
)
############################
# Rebuild the term-document matrix, this time accepting terms of any length
# (from 1 character upwards, no upper bound), and show its summary.
tdm <- TermDocumentMatrix(
  x = rev,
  control = list(wordLengths = c(1, Inf))
)
print(tdm)
## <<TermDocumentMatrix (terms: 873, documents: 180)>>
## Non-/sparse entries: 1131/156009
## Sparsity : 99%
## Maximal term length: 96
## Weighting : term frequency (tf)
# Inspect the frequent words: terms found with a lower frequency bound of 5.
freq.terms <- findFreqTerms(tdm, lowfreq = 5)
print(freq.terms)
## [1] "golkar" "partai" "gibran" "pdip" "prabowo"
## [6] "kader" "dinasti" "politik" "gibrantweet" "jokowi"
## [11] "kekuasaan" "kalah"
# Total count of each term over all documents, restricted to the terms that
# occur at least 5 times.
term.freq <- rowSums(as.matrix(tdm))
term.freq <- term.freq[term.freq >= 5]
df <- data.frame(term = names(term.freq), freq = term.freq)
# Horizontal bar chart of the frequent terms and their counts.
ggplot(df, aes(x = term, y = freq)) +
  geom_col() +
  labs(x = "Terms", y = "Count") +
  coord_flip()
# Remove the rarely occurring (sparse) terms using a sparsity threshold of
# 0.95, then print what is left as an ordinary term-by-document matrix.
tdm2 <- removeSparseTerms(tdm, sparse = 0.95)
m2 <- as.matrix(tdm2)
print(m2)
## Docs
## Terms 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26
## golkar 1 0 0 1 0 0 0 0 1 0 0 0 2 0 0 0 0 0 0 0 0 0 1 0 0 0
## partai 1 0 0 0 0 0 0 1 0 0 2 0 2 0 0 0 2 3 0 0 0 0 0 0 0 0
## gibran 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 2 0 0
## prabowo 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## kader 0 0 0 0 0 0 0 2 1 0 0 0 4 0 0 0 0 0 0 0 0 0 1 0 0 0
## dinasti 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0
## politik 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## Docs
## Terms 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49
## golkar 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
## partai 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
## gibran 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## prabowo 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## kader 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
## dinasti 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## politik 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## Docs
## Terms 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72
## golkar 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0
## partai 0 0 0 0 0 0 0 0 1 0 0 0 0 0 2 0 0 0 0 0 0 0 0
## gibran 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
## prabowo 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1
## kader 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
## dinasti 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## politik 0 0 2 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
## Docs
## Terms 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95
## golkar 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 2 0
## partai 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 1 1 0
## gibran 1 0 0 1 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1
## prabowo 1 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 1
## kader 0 2 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0
## dinasti 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## politik 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0
## Docs
## Terms 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113
## golkar 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## partai 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## gibran 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
## prabowo 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
## kader 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## dinasti 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0
## politik 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## Docs
## Terms 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130
## golkar 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## partai 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## gibran 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## prabowo 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## kader 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## dinasti 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## politik 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
## Docs
## Terms 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147
## golkar 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
## partai 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0
## gibran 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
## prabowo 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0
## kader 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## dinasti 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
## politik 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
## Docs
## Terms 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164
## golkar 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## partai 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## gibran 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
## prabowo 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## kader 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## dinasti 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
## politik 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0
## Docs
## Terms 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180
## golkar 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## partai 0 0 0 0 0 0 0 0 3 0 1 0 0 0 0 0
## gibran 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0
## prabowo 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## kader 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
## dinasti 1 0 0 0 1 1 1 0 0 1 0 1 1 0 0 0
## politik 1 0 0 0 1 0 1 0 0 1 0 1 1 0 1 0
# Cluster the terms hierarchically. The rows of m2 are terms, so dist()
# measures distances between terms after scale() has standardised each
# document column.
# NOTE(review): documents that contain none of these terms are constant
# columns - confirm that standardising them behaves as intended.
distMatrix <- dist(scale(m2))
# "ward" is the old name of "ward.D"; using the current name builds the same
# tree without triggering the rename message.
fit <- hclust(distMatrix, method = "ward.D")
plot(fit)
rect.hclust(fit, k = 4) # outline the 4 clusters obtained by cutting the tree
# Cluster the documents (tweets) with k-means; transpose first so that every
# row is one document.
m3 <- t(m2)
set.seed(122) # fixed random seed
k <- 2        # number of clusters
kmeansResult <- kmeans(x = m3, centers = k)
# Cluster centres, rounded to 3 decimals.
print(round(kmeansResult$centers, digits = 3))
## golkar partai gibran prabowo kader dinasti politik
## 1 0.095 0.048 0.238 0.095 0.000 0.667 0.857
## 2 0.075 0.157 0.082 0.063 0.101 0.000 0.000
# For every cluster, print the (up to) five terms with the largest centre
# values, on one line per cluster.
for (i in seq_len(k)) {
  cat(paste0("cluster ", i, ": "))
  # Order this cluster's centre values from largest to smallest.
  s <- sort(kmeansResult$centers[i, ], decreasing = TRUE)
  # head() avoids NA padding when fewer than five terms are available.
  cat(head(names(s), 5), "\n")
  # print the tweets of every cluster
  # print(tweets[which(kmeansResult$cluster == i)])
}
## cluster 1: politik dinasti gibran golkar prabowo
## cluster 2: partai kader gibran golkar prabowo