# ============================================================
# ANALISIS SENTIMEN KOMENTAR TIKTOK - NADIEM MAKARIM
# ============================================================


# TAHAP 1: PERSIAPAN

library(readxl)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(stringr)
## Warning: package 'stringr' was built under R version 4.5.3
library(tidytext)
## Warning: package 'tidytext' was built under R version 4.5.3
library(tidyr)
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.5.3
library(wordcloud)
## Warning: package 'wordcloud' was built under R version 4.5.3
## Loading required package: RColorBrewer
library(RColorBrewer)
library(e1071)
## Warning: package 'e1071' was built under R version 4.5.3
## 
## Attaching package: 'e1071'
## The following object is masked from 'package:ggplot2':
## 
##     element
library(caret)
## Warning: package 'caret' was built under R version 4.5.3
## Loading required package: lattice
library(tm)
## Warning: package 'tm' was built under R version 4.5.3
## Loading required package: NLP
## 
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
## 
##     annotate
library(showtext)
## Loading required package: sysfonts
## Loading required package: showtextdb
# TAHAP 2: BACA DATA

df <- read_excel("UAS TM.xlsx")
## Warning: Expecting numeric in Q1135 / R1135C17: got
## 'https://p19-common-sign.tiktokcdn-us.com/tos-alisg-avt-0068/6c84f815f107f50ada31b259f08137a9~tplv-tiktokx-cropcenter:100:100.jpg?dr=9640&refresh_token=bf628aea&x-expires=1779022800&x-signature=%2B2M%2BNzKkgZs0846VRCtdjezD2Uw%3D&t=4d5b0474&ps=13740610&shp=30310797&shcp=ff37627b&idc=useast8'
## Warning: Expecting numeric in Q1304 / R1304C17: got
## 'https://www.tiktok.com/@hukum.perubahan/video/7613662301048065298'
## Warning: Expecting logical in T1304 / R1304C20: got
## 'https://p16-common-sign.tiktokcdn-us.com/tos-alisg-avt-0068/e6a08aeec43ef5a586fd93c77f93ce88~tplv-tiktokx-cropcenter:100:100.jpg?dr=9640&refresh_token=00ce081c&x-expires=1779022800&x-signature=Qy3e0Qnh0cRCYHF6FslHr3fIBWg%3D&t=4d5b0474&ps=13740610&shp=30310797&shcp=ff37627b&idc=useast5'
## Warning: Expecting numeric in Q1987 / R1987C17: got '2026-05-16T11:57:36.000Z'
## Warning: Expecting logical in S1987 / R1987C19: got
## 'https://www.tiktok.com/@awbimax/video/7639644459986210069'
## Warning: Expecting logical in V1987 / R1987C22: got
## 'https://p16-common-sign.tiktokcdn-us.com/tos-alisg-avt-0068/49b4a87de55aa0c13864c608a083661c~tplv-tiktokx-cropcenter:100:100.jpg?dr=9640&refresh_token=1dbcefc6&x-expires=1779022800&x-signature=sq1iQcVyos3ATlGPC8%2Fa8%2FedX24%3D&t=4d5b0474&ps=13740610&shp=30310797&shcp=ff37627b&idc=useast8'
## Warning: Expecting numeric in Q3353 / R3353C17: got
## 'https://p16-common-sign.tiktokcdn-us.com/tos-alisg-avt-0068/d8ed80cc03e08eba5f9044bbbf698761~tplv-tiktokx-cropcenter:100:100.jpg?dr=9640&refresh_token=7c4ddc53&x-expires=1779022800&x-signature=p9wHCuRmgEaE1dOCjdX88O7WCCw%3D&t=4d5b0474&ps=13740610&shp=30310797&shcp=ff37627b&idc=useast8'
## Warning: Expecting numeric in Q3799 / R3799C17: got
## 'https://p16-common-sign.tiktokcdn.com/tos-alisg-avt-0068/c79d586927b8199bcc5428c68cf256e9~tplv-tiktokx-cropcenter:100:100.jpg?dr=14579&refresh_token=26d0f079&x-expires=1779022800&x-signature=hS31oE26lZCKw9PWCZnlaOxv2pY%3D&t=4d5b0474&ps=13740610&shp=30310797&shcp=ff37627b&idc=my2'
## Warning: Expecting numeric in Q3973 / R3973C17: got
## 'https://p16-common-sign.tiktokcdn-us.com/tos-alisg-avt-0068/18ff0cf74deba0c965fe4ba1e917081d~tplv-tiktokx-cropcenter:100:100.jpg?dr=9640&refresh_token=8d99dc8a&x-expires=1779022800&x-signature=2%2BXrW%2BMllzRq22p1ZZDBMchx%2BCA%3D&t=4d5b0474&ps=13740610&shp=30310797&shcp=ff37627b&idc=useast5'
## New names:
## • `` -> `...10`
## • `` -> `...11`
## • `` -> `...12`
## • `` -> `...13`
## • `` -> `...14`
## • `` -> `...15`
## • `` -> `...16`
## • `` -> `...17`
## • `` -> `...18`
## • `` -> `...19`
## • `` -> `...20`
## • `` -> `...21`
## • `` -> `...22`
View(df)
glimpse(df)
## Rows: 4,527
## Columns: 22
## $ text              <chr> "seandainya saya ketua MBG.", "ini lah mengapa orang…
## $ diggCount         <chr> NA, "mending kerja diluar negeri", NA, "perxayalah..…
## $ replyCommentTotal <chr> NA, "gaji gede ga urusan sama penjara.", NA, "jauh d…
## $ createTimeISO     <chr> NA, "5845", NA, "3081", "2026-05-14T02:20:05.000Z", …
## $ uniqueId          <chr> NA, "26", NA, "62", "aw_rich1996", "2026-03-05T21:20…
## $ videoWebUrl       <chr> NA, "2026-03-05T19:27:17.000Z", NA, "2026-03-05T13:4…
## $ uid               <chr> NA, "maya_jkt", NA, "qq_ay", "6.7093563374809897E+18…
## $ cid               <chr> NA, "https://www.tiktok.com/@hukum.perubahan/video/7…
## $ avatarThumbnail   <chr> NA, "6.8414617610617395E+18", NA, "7.029302470825230…
## $ ...10             <chr> NA, "7.6138552551391703E+18", NA, "7.613767892466870…
## $ ...11             <chr> NA, "https://p19-common-sign.tiktokcdn-us.com/tos-al…
## $ ...12             <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ ...13             <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ ...14             <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ ...15             <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ ...16             <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ ...17             <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ ...18             <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ ...19             <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ ...20             <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ ...21             <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ ...22             <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
head(df$text, 10)
##  [1] "seandainya saya ketua MBG."                                                                                                                                                             
##  [2] "ini lah mengapa orang2 pinter males masuk pemerintahan"                                                                                                                                 
##  [3] "MOMMY UNIVERSE will help You pak Nadim."                                                                                                                                                
##  [4] "sy kenal nadiem dr sejak di US"                                                                                                                                                         
##  [5] "Ada yg bisa bantuin bapak ini gak? 😭"                                                                                                                                                  
##  [6] "4"                                                                                                                                                                                      
##  [7] "PARA OJEK ON LINE KALIAN KEMANA SIIH..MASA NGK ADA YG BELA BAPAK KALIAN BAHKAN NADIEM ADALAH BAPAK KENDARAAN ON LINE INDONESIA"                                                         
##  [8] "salah Nadim cuma 1 ."                                                                                                                                                                   
##  [9] "Nadiem pegang rahasia apa ya"                                                                                                                                                           
## [10] "Yang menurut saya lebih keliru lagi adalah kenapa pilih laptop chromebook yg berbasis android dan harus online pemakaiannnya karena tdk bisa instal office misalnya seperti di windows."
# TAHAP 3: PREPROCESSING

bersihkan_teks <- function(x) {
  x %>%
    str_to_lower() %>%
    str_remove_all("http\\S+|www\\S+") %>%
    str_remove_all("@\\w+") %>%
    str_remove_all("#\\w+") %>%
    str_remove_all("[0-9]+") %>%
    str_remove_all("[^a-z\\s]") %>%
    str_squish()
}

df <- df %>%
  filter(!is.na(text), str_length(text) > 3) %>%
  mutate(text_bersih = bersihkan_teks(text))

head(df %>% select(text, text_bersih), 5)
## # A tibble: 5 × 2
##   text                                                     text_bersih          
##   <chr>                                                    <chr>                
## 1 "seandainya saya ketua MBG."                             seandainya saya ketu…
## 2 "ini lah mengapa orang2 pinter males masuk pemerintahan" ini lah mengapa oran…
## 3 "MOMMY UNIVERSE will help You pak Nadim."                mommy universe will …
## 4 "sy kenal nadiem dr sejak di US"                         sy kenal nadiem dr s…
## 5 "Ada yg bisa bantuin bapak ini gak? \U0001f62d"          ada yg bisa bantuin …
# TAHAP 4: NORMALISASI TEKS

kamus_normalisasi <- c(
  "gua" = "saya", "gue" = "saya", "gw" = "saya", "aku" = "saya", "sy" = "saya",
  "w" = "saya",
  "ngk" = "tidak",
  "lo" = "kamu", "lu" = "kamu", "elo" = "kamu",
  "dia" = "dia", "dy" = "dia", "dya" = "dia",
  "kita" = "kita", "kt" = "kita",
  "mereka" = "mereka", "mrk" = "mereka",
  "nggak" = "tidak", "ngga" = "tidak", "gak" = "tidak", "ga" = "tidak",
  "enggak" = "tidak", "engga" = "tidak", "kagak" = "tidak",
  "ndak" = "tidak", "ndk" = "tidak", "g" = "tidak", "gk" = "tidak",
  "tak" = "tidak", "tdk" = "tidak",
  "blm" = "belum", "blom" = "belum",
  "udh" = "sudah", "udah" = "sudah", "dah" = "sudah", "sdh" = "sudah",
  "emg" = "memang", "emang" = "memang",
  "krn" = "karena", "karna" = "karena", "krna" = "karena",
  "klo" = "kalau", "kalu" = "kalau", "kl" = "kalau", "klw" = "kalau",
  "tp" = "tapi", "tpi" = "tapi",
  "spy" = "supaya", "biar" = "supaya",
  "yg" = "yang", "yng" = "yang",
  "dgn" = "dengan", "dg" = "dengan", "sm" = "sama",
  "utk" = "untuk", "buat" = "untuk", "tuk" = "untuk",
  "dr" = "dari", "dri" = "dari",
  "pd" = "pada", "ke" = "ke",
  "jg" = "juga", "jga" = "juga",
  "sdg" = "sedang", "lagi" = "sedang", "lg" = "sedang",
  "msh" = "masih",
  "hrs" = "harus",
  "bs" = "bisa", "bsa" = "bisa",
  "mau" = "mau", "mo" = "mau",
  "aja" = "saja", "aj" = "saja",
  "doang" = "saja",
  "banget" = "sangat", "bgt" = "sangat", "bngt" = "sangat",
  "byk" = "banyak", "bnyk" = "banyak",
  "sgt" = "sangat",
  "skrg" = "sekarang", "skrng" = "sekarang",
  "dulu" = "dulu", "dl" = "dulu",
  "lbh" = "lebih",
  "krg" = "kurang",
  "bilang" = "berkata", "ngomong" = "berkata", "ngbln" = "berkata",
  "kerja" = "bekerja", "krja" = "bekerja",
  "makan" = "makan", "mkn" = "makan",
  "nonton" = "menonton", "ntn" = "menonton",
  "beli" = "membeli",
  "kasih" = "memberi", "ksh" = "memberi",
  "liat" = "melihat", "lht" = "melihat",
  "tau" = "tahu", "tw" = "tahu",
  "bikin" = "membuat", "bkin" = "membuat",
  "nyari" = "mencari",
  "nemu" = "menemukan",
  "nyebut" = "menyebut",
  "ngerti" = "mengerti",
  "mikir" = "berpikir", "mikirin" = "memikirkan",
  "ngerasa" = "merasa",
  "dapet" = "mendapat", "dpet" = "mendapat",
  "bayar" = "membayar",
  "pake" = "menggunakan", "make" = "menggunakan", "pk" = "menggunakan",
  "bagus" = "bagus", "keren" = "bagus", "mantap" = "bagus",
  "mantul" = "bagus", "josss" = "bagus", "joss" = "bagus",
  "jelek" = "jelek", "parah" = "parah", "ancur" = "hancur",
  "gila" = "gila", "gilak" = "gila",
  "susah" = "sulit",
  "gampang" = "mudah", "enteng" = "mudah",
  "males" = "malas", "mls" = "malas",
  "capek" = "lelah", "cape" = "lelah",
  "sedih" = "sedih",
  "seneng" = "senang", "senang" = "senang",
  "kesel" = "kesal", "bete" = "kesal",
  "marah" = "marah", "emosi" = "marah",
  "kaget" = "terkejut",
  "lucu" = "lucu", "ngakak" = "lucu", "wkwk" = "lucu", "wkwkwk" = "lucu",
  "mewek" = "menangis",
  "murah" = "murah", "mahal" = "mahal",
  "bener" = "benar", "bnr" = "benar", "bner" = "benar",
  "salah" = "salah", "slah" = "salah",
  "penting" = "penting", "pnting" = "penting",
  "hebat" = "hebat",
  "mantep" = "bagus",
  "pinter" = "pintar",
  "pemerintah" = "pemerintah", "pmrntah" = "pemerintah",
  "menteri" = "menteri", "mntri" = "menteri",
  "pendidikan" = "pendidikan", "pdidikan" = "pendidikan",
  "sekolah" = "sekolah", "sklh" = "sekolah",
  "kampus" = "kampus", "kmpus" = "kampus",
  "rakyat" = "rakyat", "rkyat" = "rakyat",
  "negara" = "negara", "nkri" = "negara",
  "uang" = "uang", "duit" = "uang",
  "gaji" = "gaji",
  "korupsi" = "korupsi", "korup" = "korupsi",
  "kebijakan" = "kebijakan",
  "wkwk" = "lucu", "haha" = "lucu", "hihi" = "lucu",
  "lol" = "lucu",
  "hmm" = "", "hmmm" = "", "eh" = "", "ah" = "", "oh" = "",
  "yah" = "", "ya" = "", "iya" = "ya",
  "dong" = "", "deh" = "", "sih" = "", "nih" = "",
  "kan" = "", "lah" = "",
  "woi" = "", "hey" = "", "hei" = "", "baekkk" = "baik"
)

normalisasi <- function(x) {
  kata <- str_split(x, "\\s+")[[1]]
  kata_baru <- ifelse(kata %in% names(kamus_normalisasi), kamus_normalisasi[kata], kata)
  kata_baru <- kata_baru[kata_baru != ""]
  paste(kata_baru, collapse = " ")
}

df <- df %>%
  mutate(text_bersih = sapply(text_bersih, normalisasi))
# TAHAP 5: PENANGANAN NEGASI

kata_negasi <- c("tidak", "tak", "bukan", "belum", "jangan", "ga", "gak",
                 "nggak", "ngga", "enggak", "ndk", "ndak", "kagak", "gk")

pola_negasi <- paste0("\\b(", paste(kata_negasi, collapse = "|"), ")\\s+(\\w+)")

tangani_negasi <- function(x) {
  str_replace_all(x, pola_negasi, "\\1_\\2")
}

df <- df %>%
  mutate(text_bersih = tangani_negasi(text_bersih))
# TAHAP 6: LABELING SENTIMEN (LEXICON-BASED)

positif <- read.table("C:/Users/fadhi/Downloads/positive.tsv", header = TRUE, sep = "\t", col.names = c("word", "skor"))
negatif <- read.table("C:/Users/fadhi/Downloads/negative.tsv", header = TRUE, sep = "\t", col.names = c("word", "skor"))

kamus <- bind_rows(
  positif %>% mutate(sentimen = "positif"),
  negatif %>% mutate(sentimen = "negatif")
) %>% select(word, sentimen)

token <- df %>%
  select(text_bersih) %>%
  mutate(id = row_number()) %>%
  unnest_tokens(word, text_bersih)

hasil_kata <- token %>%
  inner_join(kamus, by = "word")
## Warning in inner_join(., kamus, by = "word"): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 20 of `x` matches multiple rows in `y`.
## ℹ Row 7285 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.
skor_per_komentar <- hasil_kata %>%
  group_by(id) %>%
  count(sentimen) %>%
  pivot_wider(names_from = sentimen, values_from = n, values_fill = 0) %>%
  mutate(
    label = case_when(
      positif > negatif ~ "Positif",
      negatif > positif ~ "Negatif",
      TRUE              ~ "Netral"
    )
  )

df <- df %>%
  mutate(id = row_number()) %>%
  left_join(skor_per_komentar %>% select(id, label), by = "id") %>%
  mutate(label = replace_na(label, "Netral"))

# Tampilkan distribusi label lexicon
cat("=== Distribusi Label Sentimen (Lexicon) ===\n")
## === Distribusi Label Sentimen (Lexicon) ===
print(table(df$label))
## 
## Negatif  Netral Positif 
##    1885    1770     666
# Visualisasi distribusi lexicon
font_add_google("Poppins", "poppins")
showtext_auto()

df_lexicon <- df %>%
  count(label) %>%
  mutate(
    persen = round(n / sum(n) * 100, 1),
    label = factor(label, levels = c("Positif", "Negatif", "Netral"))
  )

ggplot(df_lexicon, aes(x = label, y = n, fill = label)) +
  geom_col(width = 0.5, show.legend = FALSE) +
  geom_text(aes(label = paste0(n, "\n(", persen, "%)")),
            vjust = -0.5, size = 4, family = "poppins", color = "#444441") +
  scale_fill_manual(values = c(
    "Positif" = "#2C2C2A",
    "Negatif" = "#888780",
    "Netral"  = "#D3D1C7"
  )) +
  scale_y_continuous(expand = expansion(mult = c(0, 0.15))) +
  labs(
    title = "Distribusi Sentimen — Lexicon Based",
    subtitle = "Pelabelan awal menggunakan kamus kata positif & negatif",
    x = NULL, y = "Jumlah Komentar"
  ) +
  theme_minimal(base_family = "poppins") +
  theme(
    plot.title         = element_text(size = 14, face = "bold", color = "#2C2C2A"),
    plot.subtitle      = element_text(size = 11, color = "#888780", margin = margin(b = 12)),
    axis.text.x        = element_text(size = 12, color = "#2C2C2A"),
    axis.text.y        = element_text(size = 10, color = "#888780"),
    axis.title.y       = element_text(size = 11, color = "#888780"),
    panel.grid.major.x = element_blank(),
    panel.grid.minor   = element_blank(),
    panel.grid.major.y = element_line(color = "#F1EFE8", linewidth = 0.5),
    plot.margin        = margin(20, 20, 20, 20)
  )

# TAHAP 7: KLASIFIKASI SVM

corpus <- Corpus(VectorSource(df$text_bersih))
dtm <- DocumentTermMatrix(corpus, control = list(
  weighting = weightTfIdf,
  minDocFreq = 5
))
## Warning in TermDocumentMatrix.SimpleCorpus(x, control): custom functions are
## ignored
## Warning in weighting(x): empty document(s): up up up up up up up t ke up up up
## up up up up iya iya iya up up up up up up up up up up t up up up nih ri t ri t
dtm <- removeSparseTerms(dtm, 0.99)
dtm_df <- as.data.frame(as.matrix(dtm))

cat("=== Top 10 Kata dengan TF-IDF Tertinggi ===\n")
## === Top 10 Kata dengan TF-IDF Tertinggi ===
top_tfidf <- data.frame(
  Kata = colnames(dtm_df),
  Total_TFIDF = colSums(dtm_df)
) %>%
  arrange(desc(Total_TFIDF)) %>%
  head(10)

print(top_tfidf)
##            Kata Total_TFIDF
## sticker sticker    361.2948
## pak         pak    321.1924
## sangat   sangat    279.5594
## yang       yang    246.9411
## ini         ini    242.6964
## saya       saya    239.0225
## nadiem   nadiem    228.8835
## negara   negara    184.8011
## setuju   setuju    178.4456
## benar     benar    176.1601
df <- df[1:nrow(dtm_df), ]
dtm_df$label <- as.factor(df$label) # Kolom non-numerik baru dimasukkan di sini

set.seed(42)
idx <- createDataPartition(dtm_df$label, p = 0.8, list = FALSE)
data_train <- dtm_df[idx, ]
data_test  <- dtm_df[-idx, ]

model_svm <- svm(label ~ ., data = data_train, kernel = "linear", cost = 1)
prediksi <- predict(model_svm, data_test)

# Tampilkan confusion matrix & evaluasi
cat("=== Evaluasi Model SVM ===\n")
## === Evaluasi Model SVM ===
print(confusionMatrix(prediksi, data_test$label))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Negatif Netral Positif
##    Negatif     260     55      14
##    Netral      112    284      85
##    Positif       5     15      34
## 
## Overall Statistics
##                                           
##                Accuracy : 0.669           
##                  95% CI : (0.6365, 0.7003)
##     No Information Rate : 0.4363          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.4447          
##                                           
##  Mcnemar's Test P-Value : 1.117e-15       
## 
## Statistics by Class:
## 
##                      Class: Negatif Class: Netral Class: Positif
## Sensitivity                  0.6897        0.8023        0.25564
## Specificity                  0.8583        0.6137        0.97264
## Pos Pred Value               0.7903        0.5904        0.62963
## Neg Pred Value               0.7813        0.8172        0.87778
## Prevalence                   0.4363        0.4097        0.15394
## Detection Rate               0.3009        0.3287        0.03935
## Detection Prevalence         0.3808        0.5567        0.06250
## Balanced Accuracy            0.7740        0.7080        0.61414
df$label_svm <- predict(model_svm, dtm_df[, -ncol(dtm_df)])

# Tampilkan distribusi label SVM
cat("=== Distribusi Label Sentimen (SVM) ===\n")
## === Distribusi Label Sentimen (SVM) ===
print(table(df$label_svm))
## 
## Negatif  Netral Positif 
##    1625    2423     273
# Visualisasi distribusi SVM
df_svm <- df %>%
  count(label_svm) %>%
  mutate(
    persen = round(n / sum(n) * 100, 1),
    label_svm = factor(label_svm, levels = c("Positif", "Negatif", "Netral"))
  )

ggplot(df_svm, aes(x = label_svm, y = n, fill = label_svm)) +
  geom_col(width = 0.5, show.legend = FALSE) +
  geom_text(aes(label = paste0(n, "\n(", persen, "%)")),
            vjust = -0.5, size = 4, family = "poppins", color = "#444441") +
  scale_fill_manual(values = c(
    "Positif" = "#2C2C2A",
    "Negatif" = "#888780",
    "Netral"  = "#D3D1C7"
  )) +
  scale_y_continuous(expand = expansion(mult = c(0, 0.15))) +
  labs(
    title = "Distribusi Sentimen — Klasifikasi SVM",
    subtitle = "Hasil akhir prediksi model Support Vector Machine",
    x = NULL, y = "Jumlah Komentar"
  ) +
  theme_minimal(base_family = "poppins") +
  theme(
    plot.title         = element_text(size = 14, face = "bold", color = "#2C2C2A"),
    plot.subtitle      = element_text(size = 11, color = "#888780", margin = margin(b = 12)),
    axis.text.x        = element_text(size = 12, color = "#2C2C2A"),
    axis.text.y        = element_text(size = 10, color = "#888780"),
    axis.title.y       = element_text(size = 11, color = "#888780"),
    panel.grid.major.x = element_blank(),
    panel.grid.minor   = element_blank(),
    panel.grid.major.y = element_line(color = "#F1EFE8", linewidth = 0.5),
    plot.margin        = margin(20, 20, 20, 20)
  )

# Visualisasi confusion matrix
cm <- confusionMatrix(prediksi, data_test$label)

cm_df <- as.data.frame(cm$table)
colnames(cm_df) <- c("Prediksi", "Aktual", "Jumlah")

cm_df <- cm_df %>%
  group_by(Aktual) %>%
  mutate(Persen = round(Jumlah / sum(Jumlah) * 100, 1)) %>%
  ungroup()

ggplot(cm_df, aes(x = Aktual, y = Prediksi, fill = Jumlah)) +
  geom_tile(color = "white", linewidth = 2) +
  geom_text(aes(label = paste0(Jumlah, "\n(", Persen, "%)")),
            color = "white", size = 4.5, fontface = "bold", family = "poppins") +
  scale_fill_gradient(low = "#B4B2A9", high = "#2C2C2A") +
  labs(
    title = "Confusion Matrix — Klasifikasi Sentimen SVM",
    subtitle = "Perbandingan nilai prediksi dan nilai aktual",
    x = "Nilai Aktual",
    y = "Nilai Prediksi",
    fill = "Jumlah"
  ) +
  theme_minimal(base_family = "poppins") +
  theme(
    plot.title    = element_text(size = 14, face = "bold", color = "#2C2C2A"),
    plot.subtitle = element_text(size = 11, color = "#888780", margin = margin(b = 12)),
    axis.text     = element_text(size = 12, color = "#2C2C2A"),
    axis.title    = element_text(size = 11, color = "#888780"),
    panel.grid    = element_blank(),
    legend.title  = element_text(size = 10, color = "#888780"),
    plot.margin   = margin(20, 20, 20, 20)
  )

# Cari baris ini di kode Anda (berada di bagian bawah Tahap 7):
cm <- confusionMatrix(prediksi, data_test$label, mode = "everything")


# Lalu cetak detail metriknya untuk melihat F1-Score:
print(cm$byClass)
##                Sensitivity Specificity Pos Pred Value Neg Pred Value Precision
## Class: Negatif   0.6896552   0.8583162      0.7902736      0.7813084 0.7902736
## Class: Netral    0.8022599   0.6137255      0.5904366      0.8172324 0.5904366
## Class: Positif   0.2556391   0.9726402      0.6296296      0.8777778 0.6296296
##                   Recall        F1 Prevalence Detection Rate
## Class: Negatif 0.6896552 0.7365439  0.4363426     0.30092593
## Class: Netral  0.8022599 0.6802395  0.4097222     0.32870370
## Class: Positif 0.2556391 0.3636364  0.1539352     0.03935185
##                Detection Prevalence Balanced Accuracy
## Class: Negatif             0.380787         0.7739857
## Class: Netral              0.556713         0.7079927
## Class: Positif             0.062500         0.6141397
print(confusionMatrix(prediksi, data_test$label))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Negatif Netral Positif
##    Negatif     260     55      14
##    Netral      112    284      85
##    Positif       5     15      34
## 
## Overall Statistics
##                                           
##                Accuracy : 0.669           
##                  95% CI : (0.6365, 0.7003)
##     No Information Rate : 0.4363          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.4447          
##                                           
##  Mcnemar's Test P-Value : 1.117e-15       
## 
## Statistics by Class:
## 
##                      Class: Negatif Class: Netral Class: Positif
## Sensitivity                  0.6897        0.8023        0.25564
## Specificity                  0.8583        0.6137        0.97264
## Pos Pred Value               0.7903        0.5904        0.62963
## Neg Pred Value               0.7813        0.8172        0.87778
## Prevalence                   0.4363        0.4097        0.15394
## Detection Rate               0.3009        0.3287        0.03935
## Detection Prevalence         0.3808        0.5567        0.06250
## Balanced Accuracy            0.7740        0.7080        0.61414
# TAHAP 8: VISUALISASI WORD CLOUD

stopwords_id <- c("yang", "dan", "di", "ke", "dari", "dengan", "untuk", "pada",
                  "adalah", "ini", "itu", "juga", "sudah", "atau", "tapi", "karena",
                  "kalau", "bisa", "saya", "kamu", "dia", "kita", "mereka", "kami",
                  "ada", "tidak", "akan", "lebih", "banyak", "sama", "saja", "masih",
                  "dalam", "oleh", "atas", "bawah", "antara", "lagi", "saat", "sini",
                  "sana", "mana", "apa", "siapa", "kenapa", "gimana", "bagaimana",
                  "memang", "sangat", "sekali", "seperti", "semua", "setiap", "harus",
                  "mau", "maka", "jadi", "ketika", "waktu", "lalu", "setelah", "sebelum",
                  "pak", "bapak", "mas", "bang", "kak",
                  "nadiem", "nadim", "makarim", "beliau", "nya", "presiden",
                  "baik", "ing", "bim", "kok", "kalo", "tuh",
                  "makin", "selalu", "dulu", "jujur", "semangat", "pasti",
                  "tolong", "bantu", "membuat", "terjadi", "hidup", "dunia",
                  "negeri", "negri", "org", "uppo", "ceo", "mntri",
                  "bahwa", "namun", "serta", "pun", "agar", "hingga",
                  "sejak", "selama", "sambil", "malah", "justru", "bahkan",
                  "apalagi", "padahal", "sedangkan", "meski", "walaupun",
                  "meskipun", "apakah", "apabila", "biasanya", "hanya", "cuma",
                  "telah", "sedang", "baru")

warna_sentimen <- list(
  "Positif" = c("#2C2C2A", "#444441", "#5F5E5A", "#888780", "#B4B2A9", "#D3D1C7"),
  "Negatif" = c("#2C2C2A", "#444441", "#5F5E5A", "#888780", "#B4B2A9", "#D3D1C7"),
  "Netral"  = c("#2C2C2A", "#444441", "#5F5E5A", "#888780", "#B4B2A9", "#D3D1C7")
)

library(wordcloud2)
## Warning: package 'wordcloud2' was built under R version 4.5.3
library(htmlwidgets)

tambah_judul <- function(wc, judul) {
  prependContent(wc, htmltools::tags$h3(
    judul,
    style = "text-align:center; font-family:'Poppins',sans-serif; 
             font-size:18px; font-weight:bold; color:#2C2C2A; margin-bottom:8px;"
  ))
}

# Word Cloud — Positif
kata_positif <- df %>%
  filter(label_svm == "Positif") %>%
  unnest_tokens(word, text_bersih) %>%
  filter(!word %in% stopwords_id) %>%
  filter(!str_detect(word, "^(tidak|tak|bukan|belum|jangan|ga|gak|nggak|ngga|enggak|ndak|kagak|gk)_")) %>%
  count(word, sort = TRUE) %>%
  filter(nchar(word) > 2) %>%
  rename(freq = n) %>%
  head(100)

wc_positif <- wordcloud2(
  data            = kata_positif,
  shape           = "square",
  size            = 1.2,
  color           = rep(c("#2C2C2A", "#444441", "#5F5E5A", "#888780", "#B4B2A9"), length.out = nrow(kata_positif)),
  backgroundColor = "white",
  fontFamily      = "Poppins",
  rotateRatio     = 0.2
)
tambah_judul(wc_positif, "Word Cloud — Positif")

Word Cloud — Positif

# Word Cloud — Negatif
kata_negatif <- df %>%
  filter(label_svm == "Negatif") %>%
  unnest_tokens(word, text_bersih) %>%
  filter(!word %in% stopwords_id) %>%
  filter(!str_detect(word, "^(tidak|tak|bukan|belum|jangan|ga|gak|nggak|ngga|enggak|ndak|kagak|gk)_")) %>%
  count(word, sort = TRUE) %>%
  filter(nchar(word) > 2) %>%
  rename(freq = n) %>%
  head(100)

wc_negatif <- wordcloud2(
  data            = kata_negatif,
  shape           = "square",
  size            = 0.6,
  color           = rep(c("#2C2C2A", "#444441", "#5F5E5A", "#888780", "#B4B2A9"), length.out = nrow(kata_negatif)),
  backgroundColor = "white",
  fontFamily      = "Poppins",
  rotateRatio     = 0.2
)
tambah_judul(wc_negatif, "Word Cloud — Negatif")

Word Cloud — Negatif

# Word Cloud — Netral
kata_netral <- df %>%
  filter(label_svm == "Netral") %>%
  unnest_tokens(word, text_bersih) %>%
  filter(!word %in% stopwords_id) %>%
  filter(!str_detect(word, "^(tidak|tak|bukan|belum|jangan|ga|gak|nggak|ngga|enggak|ndak|kagak|gk)_")) %>%
  count(word, sort = TRUE) %>%
  filter(nchar(word) > 2) %>%
  rename(freq = n) %>%
  head(100)

wc_netral <- wordcloud2(
  data            = kata_netral,
  shape           = "square",
  size            = 0.6,
  color           = rep(c("#2C2C2A", "#444441", "#5F5E5A", "#888780", "#B4B2A9"), length.out = nrow(kata_netral)),
  backgroundColor = "white",
  fontFamily      = "Poppins",
  rotateRatio     = 0.2
)
tambah_judul(wc_netral, "Word Cloud — Netral")

Word Cloud — Netral