#========================================================
# ANALISIS SENTIMEN ULASAN ROBLOX
#========================================================

#========================================================
# 1. INSTALL PACKAGE
#========================================================

# install.packages("readxl")
# install.packages("tm")
# install.packages("SnowballC")
# install.packages("dplyr")
# install.packages("caret")
# install.packages("e1071")
# install.packages("ggplot2")
# install.packages("tokenizers")
# install.packages("knitr")
# install.packages("ggwordcloud")

#========================================================
# 2. LOAD LIBRARY
#========================================================

library(readxl)
## Warning: package 'readxl' was built under R version 4.4.3
library(tm)
## Warning: package 'tm' was built under R version 4.4.3
## Loading required package: NLP
library(SnowballC)
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.4.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(caret)
## Warning: package 'caret' was built under R version 4.4.3
## Loading required package: ggplot2
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate
## Loading required package: lattice
## Warning: package 'lattice' was built under R version 4.4.3
library(e1071)
## Warning: package 'e1071' was built under R version 4.4.3
## 
## Attaching package: 'e1071'
## The following object is masked from 'package:ggplot2':
## 
##     element
library(ggplot2)
library(tokenizers)
## Warning: package 'tokenizers' was built under R version 4.4.3
library(knitr)
## Warning: package 'knitr' was built under R version 4.4.3
library(ggwordcloud)
## Warning: package 'ggwordcloud' was built under R version 4.4.3
#========================================================
# 3. IMPORT DATA
#========================================================

roblox <- read_excel(
  "D:/Universitas Negeri Padang/Semester 6/Data Mining/Tugas Data Text/Data Ulasan Roblox.xlsx"
)

#========================================================
# 4. MEMILIH KOLOM
#========================================================

roblox <- roblox %>%
  select(ulasan, rating)

#========================================================
# 5. MENGHAPUS DATA KOSONG
#========================================================

roblox <- roblox %>%
  filter(!is.na(ulasan))

#========================================================
# 6. PELABELAN SENTIMEN
#========================================================

roblox$sentimen <- ifelse(
  roblox$rating >= 4,
  "positif",
  ifelse(
    roblox$rating <= 2,
    "negatif",
    NA
  )
)

# Menghapus data netral
roblox <- na.omit(roblox)

#========================================================
# 7. MEMBUAT CORPUS
#========================================================

corpus <- VCorpus(
  VectorSource(roblox$ulasan)
)

#========================================================
# 8. CASE FOLDING
#========================================================

corpus <- tm_map(
  corpus,
  content_transformer(tolower)
)

#========================================================
# 9. MENGHAPUS TANDA BACA
#========================================================

corpus <- tm_map(
  corpus,
  removePunctuation
)

#========================================================
# 10. MENGHAPUS ANGKA
#========================================================

corpus <- tm_map(
  corpus,
  removeNumbers
)

#========================================================
# 11. MENGHAPUS KARAKTER KHUSUS
#========================================================

corpus <- tm_map(
  corpus,
  content_transformer(function(x){
    
    gsub(
      "[^[:alnum:][:space:]]",
      " ",
      x
    )
    
  })
)

#========================================================
# 12. MENGHAPUS SPASI BERLEBIH
#========================================================

corpus <- tm_map(
  corpus,
  stripWhitespace
)

#========================================================
# 13. MENGHAPUS STOPWORD
#========================================================

stop_id <- c(
  "yang","dan","di","ke","dari",
  "untuk","pada","dengan","adalah",
  "itu","ini","karena","jadi",
  "saya","aku","nya","atau",
  "dalam","tidak","ada","sudah",
  "agar","lebih","bisa","sangat"
)

corpus <- tm_map(
  corpus,
  removeWords,
  stop_id
)

#========================================================
# 14. STEMMING
#========================================================

corpus <- tm_map(
  corpus,
  stemDocument
)

#========================================================
# 15. HASIL PREPROCESSING
#========================================================

cleaned_text <- sapply(
  corpus,
  as.character
)

#========================================================
# 16. TOKENISASI
#========================================================

tokens <- tokenize_words(
  cleaned_text
)

tokens[1:5]
## $`1`
## [1] "saiki" "ra"    "iso"   "man"   "cor"  
## 
## $`2`
## [1] "jelek"  "sekali"
## 
## $`3`
## [1] "bagus"  "cinta"  "roblox" "selalu"
## 
## $`4`
##  [1] "game"       "burik"      "jelek"      "kebanyakan" "bug"       
##  [6] "masalah"    "usia"       "juga"       "nggak"      "pant"      
## [11] "nihh"       "game"       "buang"      "aja"       
## 
## $`5`
##  [1] "makin"      "lama"       "makin"      "ngaco"      "main"      
##  [6] "game"       "aja"        "harus"      "verifikasi" "email"     
## [11] "sama"       "aja"        "kek"        "mau"        "nge"       
## [16] "hack"       "masa"       "main"       "game"       "aja"       
## [21] "make"       "email"      "kan"        "lucuðÿ"     "ðÿ"
#========================================================
# 17. PERBANDINGAN TEKS SEBELUM DAN SESUDAH
#========================================================

comparison <- data.frame(
  sebelum = roblox$ulasan[1:5],
  sesudah = cleaned_text[1:5]
)

kable(
  comparison,
  caption = "Perbandingan Sebelum dan Sesudah Preprocessing"
)
Perbandingan Sebelum dan Sesudah Preprocessing
sebelum sesudah
Saiki Ra ISO man cor saiki ra iso man cor
JELEK SEKALI jelek sekali
bagus cinta Roblox selalu bagus cinta roblox selalu
game burik jelek kebanyakan bug masalah usia juga nggak pantes ada nihh game buang aja game burik jelek kebanyakan bug masalah usia juga nggak pant nihh game buang aja
makin lama makin ngaco main game aja harus verifikasi email itu sama aja kek mau nge hack masa main game aja make email kan lucu😹😹 makin lama makin ngaco main game aja harus verifikasi email sama aja kek mau nge hack masa main game aja make email kan lucuðÿ ðÿ
#========================================================
# 18. MEMBUAT DATA BERSIH
#========================================================

data_clean <- data.frame(
  cleaned_text = cleaned_text,
  sentimen = roblox$sentimen
)

head(data_clean)
##                                                                                                                        cleaned_text
## 1                                                                                                              saiki ra iso man cor
## 2                                                                                                                      jelek sekali
## 3                                                                                                         bagus cinta roblox selalu
## 4                                                  game burik jelek kebanyakan bug masalah usia juga nggak pant nihh game buang aja
## 5 makin lama makin ngaco main game aja harus verifikasi email sama aja kek mau nge hack masa main game aja make email kan lucuðÿ ðÿ
## 6                             ive top up my app but the robux hasnt come in yet my money is gone and i cant get anythingðÿ ðÿ ðÿ ðÿ
##   sentimen
## 1  negatif
## 2  negatif
## 3  positif
## 4  negatif
## 5  negatif
## 6  negatif
#========================================================
# 19. MEMBUAT DOCUMENT TERM MATRIX
#========================================================

corpus_clean <- VCorpus(
  VectorSource(data_clean$cleaned_text)
)

dtm <- DocumentTermMatrix(
  corpus_clean
)

#========================================================
# 20. MEMBATASI TERM
#========================================================

dtm <- removeSparseTerms(
  dtm,
  0.99
)

freq_terms <- findFreqTerms(
  dtm,
  lowfreq = 5
)

dtm <- dtm[
  ,
  colnames(dtm) %in% freq_terms
]

#========================================================
# 21. TF-IDF
#========================================================

dtm_tfidf <- weightTfIdf(
  dtm
)
## Warning in weightTfIdf(dtm): empty document(s): 1 11 47 50 56 58 63 71 75 96
## 102 127 155 156 158 159 191 216 225 236 239 245 256 258 264 271 301 307 332 354
## 360 365 388 397 407 419 440 451
tfidf_matrix <- as.matrix(
  dtm_tfidf
)

#========================================================
# 22. BIGRAM
#========================================================

bigram <- tokenize_ngrams(
  cleaned_text,
  n = 2
)

bigram[1:5]
## $`1`
## [1] "saiki ra" "ra iso"   "iso man"  "man cor" 
## 
## $`2`
## [1] "jelek sekali"
## 
## $`3`
## [1] "bagus cinta"   "cinta roblox"  "roblox selalu"
## 
## $`4`
##  [1] "game burik"       "burik jelek"      "jelek kebanyakan" "kebanyakan bug"  
##  [5] "bug masalah"      "masalah usia"     "usia juga"        "juga nggak"      
##  [9] "nggak pant"       "pant nihh"        "nihh game"        "game buang"      
## [13] "buang aja"       
## 
## $`5`
##  [1] "makin lama"       "lama makin"       "makin ngaco"      "ngaco main"      
##  [5] "main game"        "game aja"         "aja harus"        "harus verifikasi"
##  [9] "verifikasi email" "email sama"       "sama aja"         "aja kek"         
## [13] "kek mau"          "mau nge"          "nge hack"         "hack masa"       
## [17] "masa main"        "main game"        "game aja"         "aja make"        
## [21] "make email"       "email kan"        "kan lucuðÿ"       "lucuðÿ ðÿ"
#========================================================
# 23. MEMBAGI DATA TRAINING DAN TESTING
#========================================================

set.seed(123)

index <- createDataPartition(
  data_clean$sentimen,
  p = 0.8,
  list = FALSE
)

x_train <- tfidf_matrix[index, ]
x_test  <- tfidf_matrix[-index, ]

y_train <- as.factor(
  data_clean$sentimen[index]
)

y_test <- as.factor(
  data_clean$sentimen[-index]
)

#========================================================
# 24. MENGUBAH DATA MENJADI DATA FRAME
#========================================================

x_train <- as.data.frame(x_train)
x_test  <- as.data.frame(x_test)

#========================================================
# 25. MELATIH MODEL NAIVE BAYES
#========================================================

model_nb <- naiveBayes(
  x = x_train,
  y = y_train
)

# Menampilkan model Naive Bayes
model_nb
## 
## Naive Bayes Classifier for Discrete Predictors
## 
## Call:
## naiveBayes.default(x = x_train, y = y_train)
## 
## A-priori probabilities:
## y_train
##  negatif  positif 
## 0.538874 0.461126 
## 
## Conditional probabilities:
##          age
## y_train          [,1]       [,2]
##   negatif 0.012589476 0.08025591
##   positif 0.004475338 0.04221010
## 
##          aja
## y_train         [,1]      [,2]
##   negatif 0.03563545 0.1417611
##   positif 0.01318776 0.1498781
## 
##          akun
## y_train          [,1]       [,2]
##   negatif 0.037767497 0.24247001
##   positif 0.007074082 0.06285928
## 
##          anak
## y_train          [,1]       [,2]
##   negatif 0.014761126 0.09248562
##   positif 0.003099464 0.04064909
## 
##          and
## y_train         [,1]      [,2]
##   negatif 0.02259014 0.1790483
##   positif 0.02036292 0.1454516
## 
##          aneh
## y_train         [,1]      [,2]
##   negatif 0.05072338 0.2637936
##   positif 0.00000000 0.0000000
## 
##          anj
## y_train         [,1]      [,2]
##   negatif 0.03289195 0.2713134
##   positif 0.00000000 0.0000000
## 
##          apa
## y_train         [,1]       [,2]
##   negatif 0.02739584 0.21278974
##   positif 0.00614284 0.06262195
## 
##          are
## y_train          [,1]       [,2]
##   negatif 0.011096096 0.09128297
##   positif 0.002113132 0.02771347
## 
##          asik
## y_train          [,1]       [,2]
##   negatif 0.002712379 0.03845461
##   positif 0.012678795 0.16628083
## 
##          bagus
## y_train         [,1]      [,2]
##   negatif 0.02678573 0.1904248
##   positif 0.26803927 0.5984472
## 
##          baik
## y_train          [,1]       [,2]
##   negatif 0.004744929 0.04833289
##   positif 0.006339397 0.08314042
## 
##          balikin
## y_train          [,1]       [,2]
##   negatif 0.013624076 0.11494963
##   positif 0.007301423 0.09575726
## 
##          ban
## y_train          [,1]      [,2]
##   negatif 0.023412113 0.2439016
##   positif 0.005851751 0.0541080
## 
##          banget
## y_train         [,1]      [,2]
##   negatif 0.04560838 0.1961054
##   positif 0.09031087 0.3575135
## 
##          banyak
## y_train         [,1]      [,2]
##   negatif 0.03521411 0.2499447
##   positif 0.07116777 0.2487137
## 
##          baru
## y_train          [,1]       [,2]
##   negatif 0.018308558 0.23238009
##   positif 0.006383729 0.05923391
## 
##          begitu
## y_train          [,1]       [,2]
##   negatif 0.013696068 0.10080303
##   positif 0.003099464 0.04064909
## 
##          beli
## y_train          [,1]       [,2]
##   negatif 0.021456938 0.14155359
##   positif 0.008299717 0.08003098
## 
##          benerin
## y_train         [,1]       [,2]
##   negatif 0.01258275 0.09368959
##   positif 0.01216904 0.15959544
## 
##          bersama
## y_train          [,1]       [,2]
##   negatif 0.001017142 0.01442048
##   positif 0.042338118 0.50020761
## 
##          bgt
## y_train          [,1]       [,2]
##   negatif 0.004649793 0.06592219
##   positif 0.045643661 0.36478208
## 
##          bikin
## y_train          [,1]       [,2]
##   negatif 0.014808772 0.13668444
##   positif 0.008416873 0.07800109
## 
##          bintang
## y_train         [,1]       [,2]
##   negatif 0.02054182 0.09701289
##   positif 0.02920308 0.19991904
## 
##          buat
## y_train          [,1]       [,2]
##   negatif 0.019774018 0.08113835
##   positif 0.008633541 0.09044302
## 
##          bug
## y_train         [,1]      [,2]
##   negatif 0.05110028 0.3668242
##   positif 0.03103572 0.1546316
## 
##          bukan
## y_train          [,1]      [,2]
##   negatif 0.012888517 0.1216196
##   positif 0.001521455 0.0199537
## 
##          burik
## y_train         [,1]    [,2]
##   negatif 0.05212036 0.46541
##   positif 0.03409411 0.44714
## 
##          but
## y_train         [,1]       [,2]
##   negatif 0.01689021 0.12288100
##   positif 0.01155719 0.07997681
## 
##          cant
## y_train         [,1]     [,2]
##   negatif 0.03674409 0.285907
##   positif 0.01622538 0.167933
## 
##          chat
## y_train         [,1]      [,2]
##   negatif 0.08478253 0.3072986
##   positif 0.05133665 0.2058792
## 
##          coba
## y_train          [,1]       [,2]
##   negatif 0.004042813 0.04610364
##   positif 0.011141782 0.08753738
## 
##          cuma
## y_train          [,1]       [,2]
##   negatif 0.025806398 0.21548676
##   positif 0.004729453 0.06202619
## 
##          dah
## y_train          [,1]       [,2]
##   negatif 0.005352306 0.03843373
##   positif 0.000000000 0.00000000
## 
##          develop
## y_train          [,1]       [,2]
##   negatif 0.003192561 0.03201142
##   positif 0.004226265 0.05542694
## 
##          diperbaiki
## y_train          [,1]       [,2]
##   negatif 0.012054462 0.09619345
##   positif 0.003803638 0.04988425
## 
##          dong
## y_train          [,1]       [,2]
##   negatif 0.013478231 0.10914539
##   positif 0.004722948 0.04716193
## 
##          dulu
## y_train          [,1]       [,2]
##   negatif 0.023125866 0.11606028
##   positif 0.002064516 0.02707588
## 
##          eror
## y_train        [,1]      [,2]
##   negatif 0.0463014 0.3180569
##   positif 0.0000000 0.0000000
## 
##          error
## y_train         [,1]      [,2]
##   negatif 0.02696314 0.1964125
##   positif 0.00960147 0.1054096
## 
##          even
## y_train          [,1]       [,2]
##   negatif 0.008137137 0.08137035
##   positif 0.002113132 0.02771347
## 
##          fitur
## y_train         [,1]      [,2]
##   negatif 0.01703986 0.1126794
##   positif 0.02305244 0.2099215
## 
##          fix
## y_train         [,1]      [,2]
##   negatif 0.01220571 0.1287225
##   positif 0.00000000 0.0000000
## 
##          gabisa
## y_train         [,1]       [,2]
##   negatif 0.08085078 0.42909643
##   positif 0.01250893 0.09494928
## 
##          gajela
## y_train         [,1]      [,2]
##   negatif 0.01567755 0.1332881
##   positif 0.00000000 0.0000000
## 
##          gak
## y_train         [,1]      [,2]
##   negatif 0.05783204 0.1764340
##   positif 0.04638872 0.1998073
## 
##          game
## y_train         [,1]      [,2]
##   negatif 0.09425212 0.2432882
##   positif 0.15961421 0.3093285
## 
##          gamenya
## y_train         [,1]       [,2]
##   negatif 0.02136779 0.18133625
##   positif 0.01512198 0.09909258
## 
##          ganti
## y_train          [,1]       [,2]
##   negatif 0.005640541 0.06047194
##   positif 0.000000000 0.00000000
## 
##          gem
## y_train         [,1]      [,2]
##   negatif 0.01227618 0.1395831
##   positif 0.04151409 0.3150968
## 
##          geme
## y_train         [,1]       [,2]
##   negatif 0.01162242 0.09914831
##   positif 0.05084162 0.34574398
## 
##          get
## y_train         [,1]      [,2]
##   negatif 0.01522807 0.1266431
##   positif 0.00000000 0.0000000
## 
##          gimana
## y_train         [,1]      [,2]
##   negatif 0.01960601 0.1247174
##   positif 0.02164180 0.2027238
## 
##          gini
## y_train          [,1]       [,2]
##   negatif 0.006570755 0.04426261
##   positif 0.000000000 0.00000000
## 
##          gitu
## y_train          [,1]      [,2]
##   negatif 0.013854936 0.1106049
##   positif 0.003409411 0.0447140
## 
##          gua
## y_train          [,1]       [,2]
##   negatif 0.033012881 0.15074214
##   positif 0.002020011 0.02649219
## 
##          hacker
## y_train          [,1]       [,2]
##   negatif 0.001049953 0.01488565
##   positif 0.009309604 0.08891372
## 
##          hapus
## y_train          [,1]       [,2]
##   negatif 0.035399409 0.24186264
##   positif 0.004603204 0.06037046
## 
##          hari
## y_train          [,1]       [,2]
##   negatif 0.019597689 0.15967520
##   positif 0.005215302 0.06839804
## 
##          harus
## y_train         [,1]      [,2]
##   negatif 0.02325711 0.1262613
##   positif 0.01926203 0.1023164
## 
##          hilang
## y_train          [,1]      [,2]
##   negatif 0.034357361 0.2357212
##   positif 0.005370405 0.0704322
## 
##          ingin
## y_train          [,1]       [,2]
##   negatif 0.004308505 0.03695251
##   positif 0.001521130 0.01994943
## 
##          internet
## y_train          [,1]       [,2]
##   negatif 0.024396889 0.23086052
##   positif 0.003318829 0.04352603
## 
##          izin
## y_train          [,1]       [,2]
##   negatif 0.006894295 0.04154044
##   positif 0.011350687 0.10641832
## 
##          jangan
## y_train         [,1]      [,2]
##   negatif 0.01889778 0.1014590
##   positif 0.03112013 0.2126508
## 
##          jaringan
## y_train          [,1]       [,2]
##   negatif 0.041850270 0.17926861
##   positif 0.004040022 0.05298439
## 
##          jela
## y_train         [,1]      [,2]
##   negatif 0.04458548 0.4204954
##   positif 0.00000000 0.0000000
## 
##          jelek
## y_train          [,1]       [,2]
##   negatif 0.042912713 0.20934125
##   positif 0.001885343 0.02472605
## 
##          juga
## y_train         [,1]      [,2]
##   negatif 0.02487951 0.1142057
##   positif 0.02787729 0.1540298
## 
##          kalau
## y_train          [,1]       [,2]
##   negatif 0.010421355 0.06886206
##   positif 0.002731899 0.02528084
## 
##          kali
## y_train          [,1]       [,2]
##   negatif 0.007743959 0.06526385
##   positif 0.014897776 0.13860939
## 
##          kalian
## y_train          [,1]       [,2]
##   negatif 0.002169903 0.03076369
##   positif 0.001584849 0.02078510
## 
##          kalo
## y_train         [,1]       [,2]
##   negatif 0.01443717 0.08125020
##   positif 0.01187953 0.07798246
## 
##          kami
## y_train          [,1]       [,2]
##   negatif 0.009985781 0.08341376
##   positif 0.008069906 0.08204741
## 
##          kan
## y_train         [,1]      [,2]
##   negatif 0.04658733 0.2384963
##   positif 0.01954227 0.1865450
## 
##          karna
## y_train         [,1]       [,2]
##   negatif 0.02784238 0.20173333
##   positif 0.01289175 0.08967921
## 
##          kasih
## y_train         [,1]      [,2]
##   negatif 0.02006638 0.1058915
##   positif 0.02305254 0.1897839
## 
##          katanya
## y_train         [,1]      [,2]
##   negatif 0.01438162 0.1021083
##   positif 0.00000000 0.0000000
## 
##          kaya
## y_train          [,1]       [,2]
##   negatif 0.003567634 0.03567927
##   positif 0.003803638 0.04988425
## 
##          kayak
## y_train          [,1]       [,2]
##   negatif 0.014295483 0.09432597
##   positif 0.002347609 0.03078860
## 
##          kek
## y_train          [,1]       [,2]
##   negatif 0.013197447 0.09463063
##   positif 0.006818821 0.08942800
## 
##          kembali
## y_train         [,1]      [,2]
##   negatif 0.02827009 0.2087321
##   positif 0.00000000 0.0000000
## 
##          kembalikan
## y_train         [,1]      [,2]
##   negatif 0.01419054 0.1009516
##   positif 0.04802912 0.3991689
## 
##          kenapa
## y_train         [,1]       [,2]
##   negatif 0.08829927 0.44115618
##   positif 0.01849191 0.08986547
## 
##          kid
## y_train          [,1]      [,2]
##   negatif 0.012258977 0.0952168
##   positif 0.002347609 0.0307886
## 
##          kita
## y_train          [,1]       [,2]
##   negatif 0.009388336 0.06301250
##   positif 0.006441713 0.05034306
## 
##          knp
## y_train          [,1]       [,2]
##   negatif 0.009094447 0.11609203
##   positif 0.003169699 0.04157021
## 
##          kok
## y_train          [,1]       [,2]
##   negatif 0.042253741 0.25542859
##   positif 0.006823054 0.06654087
## 
##          koneksi
## y_train         [,1]      [,2]
##   negatif 0.01985849 0.1722712
##   positif 0.00000000 0.0000000
## 
##          kurang
## y_train          [,1]       [,2]
##   negatif 0.013407717 0.10974681
##   positif 0.002377274 0.03117766
## 
##          lag
## y_train         [,1]      [,2]
##   negatif 0.04796093 0.2797024
##   positif 0.02484684 0.2190398
## 
##          lagi
## y_train         [,1]       [,2]
##   negatif 0.01726328 0.07606474
##   positif 0.03844298 0.18674205
## 
##          lah
## y_train          [,1]       [,2]
##   negatif 0.047539460 0.25607051
##   positif 0.004548703 0.05965568
## 
##          lain
## y_train          [,1]       [,2]
##   negatif 0.008090821 0.05089242
##   positif 0.011559762 0.12048796
## 
##          lama
## y_train          [,1]       [,2]
##   negatif 0.009824616 0.07509032
##   positif 0.004251571 0.04772242
## 
##          lancar
## y_train          [,1]       [,2]
##   negatif 0.026432196 0.13165370
##   positif 0.003202387 0.04199892
## 
##          login
## y_train         [,1]      [,2]
##   negatif 0.03495847 0.2073995
##   positif 0.00000000 0.0000000
## 
##          maaf
## y_train         [,1]      [,2]
##   negatif 0.01801277 0.1477161
##   positif 0.00000000 0.0000000
## 
##          mabar
## y_train         [,1]      [,2]
##   negatif 0.00000000 0.0000000
##   positif 0.01498286 0.1055469
## 
##          main
## y_train         [,1]      [,2]
##   negatif 0.04948029 0.1944355
##   positif 0.02361013 0.1234539
## 
##          makin
## y_train          [,1]       [,2]
##   negatif 0.050698403 0.25225823
##   positif 0.001488429 0.01952056
## 
##          malah
## y_train         [,1]       [,2]
##   negatif 0.04011572 0.13821607
##   positif 0.01020670 0.06660103
## 
##          mana
## y_train          [,1]       [,2]
##   negatif 0.034713468 0.41604865
##   positif 0.004972057 0.05660971
## 
##          map
## y_train         [,1]      [,2]
##   negatif 0.02039849 0.1825995
##   positif 0.03485377 0.2720601
## 
##          masa
## y_train         [,1]      [,2]
##   negatif 0.01939215 0.1111708
##   positif 0.00000000 0.0000000
## 
##          masih
## y_train          [,1]       [,2]
##   negatif 0.005890757 0.06493550
##   positif 0.003668139 0.03961558
## 
##          masuk
## y_train          [,1]       [,2]
##   negatif 0.040027279 0.14469314
##   positif 0.002726311 0.03575524
## 
##          mau
## y_train         [,1]       [,2]
##   negatif 0.05819452 0.28748612
##   positif 0.02025484 0.09116203
## 
##          min
## y_train         [,1]      [,2]
##   negatif 0.01033283 0.0895945
##   positif 0.01981954 0.1902185
## 
##          minta
## y_train          [,1]       [,2]
##   negatif 0.018939469 0.21589679
##   positif 0.003521414 0.04618291
## 
##          mohon
## y_train          [,1]       [,2]
##   negatif 0.008096958 0.07387773
##   positif 0.015842695 0.12136363
## 
##          mulu
## y_train          [,1]       [,2]
##   negatif 0.051904349 0.26492962
##   positif 0.008069841 0.06121331
## 
##          nge
## y_train          [,1]       [,2]
##   negatif 0.004767212 0.04905419
##   positif 0.003169699 0.04157021
## 
##          ngeleg
## y_train          [,1]       [,2]
##   negatif 0.026193831 0.20322693
##   positif 0.003803638 0.04988425
## 
##          nggak
## y_train         [,1]       [,2]
##   negatif 0.01982016 0.14979951
##   positif 0.00140412 0.01841486
## 
##          ngobrol
## y_train          [,1]       [,2]
##   negatif 0.005601918 0.04571896
##   positif 0.002984393 0.02760045
## 
##          not
## y_train         [,1]      [,2]
##   negatif 0.01612633 0.1487158
##   positif 0.01267879 0.1662808
## 
##          orang
## y_train         [,1]      [,2]
##   negatif 0.02350637 0.1163156
##   positif 0.01080806 0.1268859
## 
##          padah
## y_train          [,1]       [,2]
##   negatif 0.068667648 0.17921967
##   positif 0.008138399 0.05741044
## 
##          pakai
## y_train          [,1]       [,2]
##   negatif 0.008289714 0.06244451
##   positif 0.000000000 0.00000000
## 
##          pake
## y_train         [,1]       [,2]
##   negatif 0.02203426 0.12816437
##   positif 0.01448532 0.09517604
## 
##          pas
## y_train         [,1]       [,2]
##   negatif 0.02193118 0.11022972
##   positif 0.01492340 0.08332469
## 
##          pembatasan
## y_train          [,1]       [,2]
##   negatif 0.006062094 0.04810505
##   positif 0.000000000 0.00000000
## 
##          peraturan
## y_train          [,1]       [,2]
##   negatif 0.004940049 0.04029971
##   positif 0.000000000 0.00000000
## 
##          perbaiki
## y_train          [,1]      [,2]
##   negatif 0.019590580 0.1435963
##   positif 0.009401889 0.0776209
## 
##          permainan
## y_train         [,1]      [,2]
##   negatif 0.00000000 0.0000000
##   positif 0.03596297 0.2876092
## 
##          play
## y_train          [,1]       [,2]
##   negatif 0.027296359 0.18704646
##   positif 0.003788234 0.04968222
## 
##          player
## y_train         [,1]       [,2]
##   negatif 0.02624153 0.24241770
##   positif 0.00280824 0.03682972
## 
##          punya
## y_train         [,1]       [,2]
##   negatif 0.00442825 0.03606733
##   positif 0.00000000 0.00000000
## 
##          ribet
## y_train         [,1]      [,2]
##   negatif 0.02612449 0.2073772
##   positif 0.00000000 0.0000000
## 
##          roblox
## y_train         [,1]      [,2]
##   negatif 0.10751259 0.2310351
##   positif 0.06804251 0.2647571
## 
##          robux
## y_train         [,1]      [,2]
##   negatif 0.02521976 0.1424150
##   positif 0.01679790 0.1302904
## 
##          rusak
## y_train         [,1]      [,2]
##   negatif 0.02600384 0.2501118
##   positif 0.00000000 0.0000000
## 
##          saat
## y_train          [,1]       [,2]
##   negatif 0.007575435 0.06059058
##   positif 0.016553085 0.21709167
## 
##          sama
## y_train         [,1]      [,2]
##   negatif 0.02392654 0.1194404
##   positif 0.03409271 0.1543004
## 
##          satu
## y_train          [,1]       [,2]
##   negatif 0.007683113 0.05651572
##   positif 0.008314449 0.07701370
## 
##          sekali
## y_train          [,1]       [,2]
##   negatif 0.002585008 0.03664882
##   positif 0.053704051 0.34546604
## 
##          sekarang
## y_train         [,1]      [,2]
##   negatif 0.04079089 0.1680948
##   positif 0.01221228 0.0737263
## 
##          selalu
## y_train          [,1]       [,2]
##   negatif 0.035057638 0.39909253
##   positif 0.008343308 0.07690337
## 
##          semoga
## y_train          [,1]      [,2]
##   negatif 0.006374091 0.0519365
##   positif 0.000000000 0.0000000
## 
##          semua
## y_train         [,1]      [,2]
##   negatif 0.01882606 0.1759561
##   positif 0.02765170 0.2119156
## 
##          sendiri
## y_train          [,1]       [,2]
##   negatif 0.009276854 0.08220573
##   positif 0.011318829 0.11989173
## 
##          seperti
## y_train          [,1]       [,2]
##   negatif 0.008403385 0.06301916
##   positif 0.000000000 0.00000000
## 
##          sere
## y_train         [,1]       [,2]
##   negatif 0.02930514 0.20488010
##   positif 0.00490003 0.06426329
## 
##          seru
## y_train         [,1]      [,2]
##   negatif 0.01884957 0.1061446
##   positif 0.26912264 0.6652773
## 
##          server
## y_train          [,1]       [,2]
##   negatif 0.021347278 0.17198540
##   positif 0.007301423 0.09575726
## 
##          setiap
## y_train          [,1]       [,2]
##   negatif 0.025428553 0.25775168
##   positif 0.005433769 0.07126321
## 
##          sih
## y_train          [,1]       [,2]
##   negatif 0.009098958 0.06910625
##   positif 0.024862403 0.14506048
## 
##          suka
## y_train         [,1]      [,2]
##   negatif 0.03353192 0.1703275
##   positif 0.09640327 0.3033615
## 
##          susah
## y_train          [,1]       [,2]
##   negatif 0.007558483 0.06441868
##   positif 0.001460285 0.01915145
## 
##          tahun
## y_train          [,1]       [,2]
##   negatif 0.028973249 0.20145134
##   positif 0.003927874 0.05151358
## 
##          tak
## y_train          [,1]       [,2]
##   negatif 0.012503715 0.12150328
##   positif 0.003169699 0.04157021
## 
##          tambah
## y_train          [,1]       [,2]
##   negatif 0.005923532 0.05192854
##   positif 0.007607277 0.09976850
## 
##          tapi
## y_train         [,1]       [,2]
##   negatif 0.02412274 0.07371383
##   positif 0.09353655 0.21450614
## 
##          tau
## y_train          [,1]       [,2]
##   negatif 0.008478888 0.05620412
##   positif 0.000000000 0.00000000
## 
##          teman
## y_train         [,1]       [,2]
##   negatif 0.01075456 0.07969407
##   positif 0.04400920 0.41577449
## 
##          temen
## y_train          [,1]       [,2]
##   negatif 0.013254553 0.10077242
##   positif 0.008191464 0.09284987
## 
##          terus
## y_train          [,1]       [,2]
##   negatif 0.027163857 0.15741042
##   positif 0.008474555 0.05523294
## 
##          tetap
## y_train          [,1]       [,2]
##   negatif 0.010412197 0.08611348
##   positif 0.007231474 0.07185856
## 
##          that
## y_train         [,1]       [,2]
##   negatif 0.01583296 0.11820627
##   positif 0.00280824 0.03682972
## 
##          the
## y_train         [,1]      [,2]
##   negatif 0.03874305 0.2316878
##   positif 0.01594155 0.1072924
## 
##          there
## y_train         [,1]       [,2]
##   negatif 0.01164248 0.08654903
##   positif 0.00737390 0.07540000
## 
##          this
## y_train         [,1]      [,2]
##   negatif 0.02212828 0.2307447
##   positif 0.01497728 0.1635799
## 
##          though
## y_train          [,1]       [,2]
##   negatif 0.008137137 0.08137035
##   positif 0.002113132 0.02771347
## 
##          tiba
## y_train          [,1]       [,2]
##   negatif 0.002712379 0.03845461
##   positif 0.021038440 0.15338148
## 
##          tolong
## y_train         [,1]      [,2]
##   negatif 0.03118359 0.1414002
##   positif 0.03683253 0.1549699
## 
##          top
## y_train          [,1]      [,2]
##   negatif 0.007717004 0.0712449
##   positif 0.027380336 0.2670229
## 
##          udah
## y_train         [,1]       [,2]
##   negatif 0.02918195 0.09419364
##   positif 0.04254743 0.29139825
## 
##          udh
## y_train          [,1]       [,2]
##   negatif 0.001688644 0.02394066
##   positif 0.003318829 0.04352603
## 
##          umur
## y_train          [,1]       [,2]
##   negatif 0.027660572 0.14280101
##   positif 0.002667608 0.02880994
## 
##          updat
## y_train         [,1]      [,2]
##   negatif 0.13481325 0.3870046
##   positif 0.04424352 0.1306901
## 
##          upgrad
## y_train         [,1]      [,2]
##   negatif 0.03763921 0.2440619
##   positif 0.01023105 0.1341790
## 
##          usia
## y_train          [,1]       [,2]
##   negatif 0.023663571 0.23147010
##   positif 0.006084519 0.07979772
## 
##          verif
## y_train          [,1]       [,2]
##   negatif 0.007651754 0.06507571
##   positif 0.017607068 0.16280310
## 
##          verifikasi
## y_train         [,1]       [,2]
##   negatif 0.02003819 0.13148381
##   positif 0.00240179 0.03149919
## 
##          verivikasi
## y_train         [,1]       [,2]
##   negatif 0.01107466 0.09928476
##   positif 0.00000000 0.00000000
## 
##          wajah
## y_train          [,1]       [,2]
##   negatif 0.002958959 0.04195048
##   positif 0.026879045 0.23515056
## 
##          walau
## y_train          [,1]       [,2]
##   negatif 0.000000000 0.00000000
##   positif 0.009684047 0.08479999
## 
##          walaupun
## y_train          [,1]       [,2]
##   negatif 0.009509282 0.08686406
##   positif 0.010143036 0.09670708
## 
##          whi
## y_train         [,1]      [,2]
##   negatif 0.02056628 0.1801272
##   positif 0.01216904 0.1595954
## 
##          wifi
## y_train          [,1]      [,2]
##   negatif 0.038413570 0.1847079
##   positif 0.002058678 0.0269993
## 
##          woi
## y_train         [,1]      [,2]
##   negatif 0.01871541 0.1578842
##   positif 0.00000000 0.0000000
## 
##          yah
## y_train          [,1]       [,2]
##   negatif 0.014104371 0.16037001
##   positif 0.004250278 0.04646562
## 
##          you
## y_train          [,1]       [,2]
##   negatif 0.006508316 0.06638406
##   positif 0.009906845 0.07558768
#========================================================
# 26. MELATIH MODEL SUPPORT VECTOR MACHINE (SVM)
#========================================================

model_svm <- svm(
  x = x_train,
  y = y_train,
  kernel = "linear"
)

# Menampilkan model SVM
model_svm
## 
## Call:
## svm.default(x = x_train, y = y_train, kernel = "linear")
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  linear 
##        cost:  1 
## 
## Number of Support Vectors:  175
#========================================================
# 27. PREDIKSI DATA UJI
#========================================================

pred_nb <- predict(
  model_nb,
  x_test
)

pred_svm <- predict(
  model_svm,
  x_test
)

# Menampilkan hasil prediksi
head(pred_nb)
## [1] positif negatif negatif positif positif negatif
## Levels: negatif positif
head(pred_svm)
##       2       3       4      15      19      22 
## negatif negatif negatif positif positif negatif 
## Levels: negatif positif
#========================================================
# 28. CONFUSION MATRIX
#========================================================

cm_nb <- confusionMatrix(
  pred_nb,
  y_test
)

cm_svm <- confusionMatrix(
  pred_svm,
  y_test
)

print(cm_nb)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction negatif positif
##    negatif      26       8
##    positif      24      35
##                                           
##                Accuracy : 0.6559          
##                  95% CI : (0.5502, 0.7514)
##     No Information Rate : 0.5376          
##     P-Value [Acc > NIR] : 0.01385         
##                                           
##                   Kappa : 0.3255          
##                                           
##  Mcnemar's Test P-Value : 0.00801         
##                                           
##             Sensitivity : 0.5200          
##             Specificity : 0.8140          
##          Pos Pred Value : 0.7647          
##          Neg Pred Value : 0.5932          
##              Prevalence : 0.5376          
##          Detection Rate : 0.2796          
##    Detection Prevalence : 0.3656          
##       Balanced Accuracy : 0.6670          
##                                           
##        'Positive' Class : negatif         
## 
print(cm_svm)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction negatif positif
##    negatif      34      14
##    positif      16      29
##                                           
##                Accuracy : 0.6774          
##                  95% CI : (0.5725, 0.7707)
##     No Information Rate : 0.5376          
##     P-Value [Acc > NIR] : 0.004276        
##                                           
##                   Kappa : 0.3533          
##                                           
##  Mcnemar's Test P-Value : 0.855132        
##                                           
##             Sensitivity : 0.6800          
##             Specificity : 0.6744          
##          Pos Pred Value : 0.7083          
##          Neg Pred Value : 0.6444          
##              Prevalence : 0.5376          
##          Detection Rate : 0.3656          
##    Detection Prevalence : 0.5161          
##       Balanced Accuracy : 0.6772          
##                                           
##        'Positive' Class : negatif         
## 
#========================================================
# 29. METRIK EVALUASI
#========================================================

akurasi_nb <- cm_nb$overall["Accuracy"]
akurasi_svm <- cm_svm$overall["Accuracy"]

precision_nb <- cm_nb$byClass["Pos Pred Value"]
precision_svm <- cm_svm$byClass["Pos Pred Value"]

recall_nb <- cm_nb$byClass["Sensitivity"]
recall_svm <- cm_svm$byClass["Sensitivity"]

f1_nb <- 2 * (
  (precision_nb * recall_nb) /
    (precision_nb + recall_nb)
)

f1_svm <- 2 * (
  (precision_svm * recall_svm) /
    (precision_svm + recall_svm)
)

cat("Akurasi Naive Bayes :", akurasi_nb, "\n")
## Akurasi Naive Bayes : 0.655914
cat("Akurasi SVM :", akurasi_svm, "\n")
## Akurasi SVM : 0.6774194
cat("Precision Naive Bayes :", precision_nb, "\n")
## Precision Naive Bayes : 0.7647059
cat("Precision SVM :", precision_svm, "\n")
## Precision SVM : 0.7083333
cat("Recall Naive Bayes :", recall_nb, "\n")
## Recall Naive Bayes : 0.52
cat("Recall SVM :", recall_svm, "\n")
## Recall SVM : 0.68
cat("F1 Score Naive Bayes :", f1_nb, "\n")
## F1 Score Naive Bayes : 0.6190476
cat("F1 Score SVM :", f1_svm, "\n")
## F1 Score SVM : 0.6938776
#========================================================
# 30. TABEL CONFUSION MATRIX
#========================================================

table(
  Actual = y_test,
  Predicted = pred_svm
)
##          Predicted
## Actual    negatif positif
##   negatif      34      16
##   positif      14      29
#========================================================
# 31. HEATMAP CONFUSION MATRIX NAIVE BAYES
#========================================================

cm_nb_table <- table(
  Actual = y_test,
  Predicted = pred_nb
)

cm_nb_df <- as.data.frame(
  cm_nb_table
)

ggplot(
  cm_nb_df,
  aes(
    x = Predicted,
    y = Actual,
    fill = Freq
  )
) +
  geom_tile() +
  geom_text(
    aes(label = Freq),
    color = "white",
    size = 5
  ) +
  labs(
    title = "Heatmap Confusion Matrix Naive Bayes",
    x = "Prediksi",
    y = "Aktual"
  ) +
  theme_minimal()

#========================================================
# 32. HEATMAP CONFUSION MATRIX SVM
#========================================================

cm_svm_table <- table(
  Actual = y_test,
  Predicted = pred_svm
)

cm_svm_df <- as.data.frame(
  cm_svm_table
)

ggplot(
  cm_svm_df,
  aes(
    x = Predicted,
    y = Actual,
    fill = Freq
  )
) +
  geom_tile() +
  geom_text(
    aes(label = Freq),
    color = "white",
    size = 5
  ) +
  labs(
    title = "Heatmap Confusion Matrix SVM",
    x = "Prediksi",
    y = "Aktual"
  ) +
  theme_minimal()

#========================================================
# 33. CEK IMBALANCE CLASS
#========================================================

table(
  data_clean$sentimen
)
## 
## negatif positif 
##     251     215
#========================================================
# 34. CEK OVERFITTING
#========================================================

pred_train <- predict(
  model_svm,
  x_train
)

train_acc <- mean(
  pred_train == y_train
)

test_acc <- mean(
  pred_svm == y_test
)

cat("Akurasi Training :", train_acc, "\n")
## Akurasi Training : 0.9544236
cat("Akurasi Testing :", test_acc, "\n")
## Akurasi Testing : 0.6774194
#========================================================
# 35. MENENTUKAN KATA PALING BERPENGARUH
#========================================================

svm_coef <- t(model_svm$coefs) %*% model_svm$SV

svm_coef <- as.vector(
  svm_coef
)

coef_df <- data.frame(
  kata = colnames(x_train),
  koefisien = svm_coef
)

#========================================================
# 36. TOP 10 KATA POSITIF
#========================================================

top_positif <- coef_df %>%
  arrange(desc(koefisien)) %>%
  slice(1:10)

kable(
  top_positif,
  caption = "Top 10 Kata yang Berpengaruh pada Sentimen Positif"
)
Top 10 Kata yang Berpengaruh pada Sentimen Positif
kata koefisien
jelek 1.1035393
updat 0.9125950
jaringan 0.8712268
get 0.6383577
chat 0.6027311
bukan 0.5404523
masa 0.4636324
kayak 0.4601234
tahun 0.4464002
semoga 0.4311776
#========================================================
# 37. TOP 10 KATA NEGATIF
#========================================================

top_negatif <- coef_df %>%
  arrange(koefisien) %>%
  slice(1:10)

kable(
  top_negatif,
  caption = "Top 10 Kata yang Berpengaruh pada Sentimen Negatif"
)
Top 10 Kata yang Berpengaruh pada Sentimen Negatif
kata koefisien
tapi -0.8629167
seru -0.7517827
sekali -0.7428487
lagi -0.7000926
fitur -0.4544341
kek -0.4310310
gem -0.3939529
tiba -0.3930159
sama -0.3577497
permainan -0.3541397
#========================================================
# 38. WORDCLOUD SENTIMEN POSITIF
#========================================================

# install.packages("wordcloud2")

library(wordcloud2)
## Warning: package 'wordcloud2' was built under R version 4.4.3
positif <- data_clean %>%
  filter(sentimen == "positif")

text_pos <- paste(
  positif$cleaned_text,
  collapse = " "
)

kata_pos <- table(
  unlist(
    strsplit(text_pos, " ")
  )
)

df_pos <- data.frame(
  kata = names(kata_pos),
  frekuensi = as.numeric(kata_pos)
)

df_pos <- df_pos %>%
  filter(kata != "") %>%
  arrange(desc(frekuensi)) %>%
  slice(1:100)

wordcloud2(
  data = df_pos,
  size = 0.8
)
#========================================================
# 39. WORDCLOUD SENTIMEN NEGATIF
#========================================================

negatif <- data_clean %>%
  filter(sentimen == "negatif")

text_neg <- paste(
  negatif$cleaned_text,
  collapse = " "
)

kata_neg <- table(
  unlist(
    strsplit(text_neg, " ")
  )
)

df_neg <- data.frame(
  kata = names(kata_neg),
  frekuensi = as.numeric(kata_neg)
)

df_neg <- df_neg %>%
  filter(kata != "") %>%
  arrange(desc(frekuensi)) %>%
  slice(1:100)

wordcloud2(
  data = df_neg,
  size = 0.8
)
#========================================================
# 40. MENYIMPAN DATA BERSIH
#========================================================

write.csv(
  data_clean,
  "hasil_preprocessing_roblox.csv",
  row.names = FALSE
)