# 1. Load library
library(ggplot2)
library(readr)
library(dplyr)
library(e1071)
library(tm)
library(caret)
library(caTools)
# 2. Read the data
# Loads the cleaned tweet dataset with readr. The recorded output below reports
# 933 rows and three columns: processed_text, sentimen and created_at.
# NOTE(review): the path is absolute and machine-specific, so the script only
# runs on this computer as written — consider a project-relative path.
data_sentimen <- read_csv("C:/Users/Acer/OneDrive - untirta.ac.id/Kuliah/MK/SEMESTER 5/Data Challenge/data_cleaning_optimal.csv")
## Rows: 933 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (2): processed_text, sentimen
## dttm (1): created_at
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# 3. Sentiment visualisation
# Tally the tweets per sentiment label (result columns: sentimen, n).
sentimen_count <- count(data_sentimen, sentimen)

# Bar chart: one bar per sentiment category, bar height = number of tweets.
ggplot(sentimen_count, aes(x = sentimen, y = n, fill = sentimen)) +
  geom_col() +
  theme_minimal() +
  labs(title = "Distribusi Sentimen", x = "Kategori", y = "Jumlah Tweet")

# Pie chart: a single stacked bar wrapped into polar coordinates.
ggplot(sentimen_count, aes(x = "", y = n, fill = sentimen)) +
  geom_bar(stat = "identity", width = 1) +
  coord_polar(theta = "y") +
  labs(title = "Persentase Sentimen") +
  theme_void()

# 4. Train & test data
# 70/30 split on the sentiment label with caTools::sample.split(); the fixed
# seed keeps the split reproducible. Only the TRUE (training) part is kept and
# written to disk here.
set.seed(1861)
split <- sample.split(data_sentimen$sentimen, SplitRatio = 0.70)
data_train <- subset(data_sentimen, split == TRUE)
write.csv(data_train, file = "train_data2.csv", row.names = FALSE)

# Train data
# Re-read the training rows as a base data.frame and keep only the text and
# label columns. They are selected by name instead of dropping column 1 by
# position, so a change in the CSV column order cannot silently remove the
# wrong column. Both names are required later: the frame is row-bound with the
# test data and `train$sentimen` supplies the training labels.
train <- read.csv("train_data2.csv")
train <- train[, c("processed_text", "sentimen")]
dim(train)
## [1] 652   2
# Test data
# Loaded from a separately stored CSV rather than from the rows held out by
# `split` above; the recorded output below reports 280 rows and 2 columns.
# NOTE(review): confirm this file shares no tweets with the training set —
# any overlap would inflate the accuracy reported further down.
test <- read.csv("C:/Users/Acer/OneDrive - untirta.ac.id/Kuliah/MK/SEMESTER 5/Data Challenge/test_data.csv")
dim(test)
## [1] 280   2
# Stack train and test so that both share one vocabulary when the text is
# turned into numeric features.
combined <- rbind(train, test)
sumber_teks <- VectorSource(combined$processed_text)
corpus <- Corpus(sumber_teks)

# Document-term matrix with unnormalised TF-IDF weighting.
# NOTE(review): tm warns "custom functions are ignored" for this call (see the
# recorded output below) — verify that the weighting is really applied.
# NOTE(review): the matrix is built from train and test together, so the
# weighting presumably also sees the test tweets — confirm this is acceptable.
bobot_tfidf <- function(x) weightTfIdf(x, normalize = FALSE)
dtm <- DocumentTermMatrix(corpus, control = list(weighting = bobot_tfidf))
## Warning in TermDocumentMatrix.SimpleCorpus(x, control): custom functions are
## ignored
# Split the shared matrix back into its train and test rows. Train rows come
# first because `combined` was built as rbind(train, test).
# seq_len() / seq(length.out = ) replace `1:n`, so an empty train or test set
# yields an empty index instead of a reversed sequence such as c(1, 0).
n_train <- nrow(train)
idx_train <- seq_len(n_train)
idx_test <- seq(n_train + 1L, length.out = nrow(combined) - n_train)
dtm_train <- dtm[idx_train, ]
dtm_test  <- dtm[idx_test, ]

# Dense matrices, as passed to svm() and predict() further down.
latih <- as.matrix(dtm_train)
uji <- as.matrix(dtm_test)

# SVM model
# Linear-kernel SVM fitted on the dense training matrix, with probability
# estimates enabled.
# NOTE(review): the recorded run warns that many term columns are constant in
# the training rows ("Cannot scale data") — presumably terms that occur only
# in the test tweets; consider dropping them before fitting.
label_latih <- as.factor(train$sentimen)
svm_model <- svm(
  x = latih,
  y = label_latih,
  kernel = "linear",
  probability = TRUE
)
## Warning in svm.default(x = latih, y = as.factor(train$sentimen), kernel =
## "linear", : Variable(s) 'buhay' and 'gobyerno' and 'habang' and 'kesal' and
## 'dibanggain' and 'hrsnya' and 'perfeksionisme' and 'akwoakwo' and 'cucu' and
## 'dri' and 'film' and 'macem' and 'nurutin' and 'part' and 'ton' and 'gelato'
## and 'ikut' and 'ndak' and 'presiden' and 'about' and 'daerah' and 'freeport'
## and 'kejam' and 'louder' and 'papua' and 'should' and 'somehow' and 'tanah' and
## 'aaccb' and 'ceed' and 'biktima' and 'jongkok' and 'literasi' and 'rilis' and
## 'dibiarin' and 'elah' and 'emphasis' and 'gabut' and 'have' and 'korup' and
## 'left' and 'letter' and 'ngebunuh' and 'nyelametin' and 'periksa' and 'with'
## and 'apel' and 'astagfirullahaladzim' and 'hadap' and 'pasteur' and 'puskes'
## and 'busa' and 'betapa' and 'busuk' and 'alot' and 'bosss' and 'buru' and
## 'jlek' and 'kasus' and 'blok' and 'enek' and 'unfriend' and 'much' and 'than'
## and 'worse' and 'itumah' and 'franz' and 'magnis' and 'marxisme' and 'botol'
## and 'nada' and 'ria' and 'tat' and 'tot' and 'urgensi' and 'trash' and 'valid'
## and 'dream' and 'guaranteed' and 'sweet' and 'ngeroko' and 'pick' and
## 'struggle' and 'too' and 'anthum' and 'busway' and 'faham' and 'kntol' and
## 'patroli' and 'always' and 'book' and 'carry' and 'khusus' and 'kosmetik' and
## 'lainnn' and 'nyampur' and 'ransel' and 'tangkal' and 'tari' and 'iii' and
## 'once' and 'wkwkwkwk' and 'being' and 'beloved' and 'bias' and 'cherry' and
## 'mom' and 'picking' and 'gelar' and 'belaka' and 'dengung' and 'gera' and
## 'gerak' and 'inspirasi' and 'pecah' and 'seasters' and 'sendat' and
## 'seremonial' and 'solid' and 'gih' and 'agitasi' and 'bareng' and 'dampak' and
## 'edukasi' and 'follower' and 'promosi' and 'provokasi' and 'slogan' and 'bae'
## and 'gawe' and 'terror' and 'basa' and 'besfriend' and 'lho' and 'ngajak' and
## 'ngopo' and 'nyesel' and 'generalisasi' and 'drtd' and 'puny' and 'sbaik' and
## 'terusss' and 'punyeta' and 'buah' and 'koplak' and 'sik' and 'haha' and 'ckck'
## and 'beranta' and 'kirain' and 'toyoda' and 'yuda' and 'buset' and 'ngamuk' and
## 'twistan' and 'luhk' and 'nying' and 'pis' and 'rem' and 'sintinkkk' and
## 'momen' and 'oalah' and 'version' and 'double' and 'halang' and 'job' and
## 'alumni' and 'pulak' and 'sejuk' and 'selera' and 'bogor' and 'isang' and
## 'kabila' and 'labas' and 'pasok' and 'tenga' and 'anjinkkkk' and 'bgttt' and
## 'tampan' and 'oops' and 'cicil' and 'gabakal' and 'jagain' and 'goblog' and
## 'hadeeee' and 'akumulasi' and 'birokratik' and 'birokratiknya' and 'finansial'
## and 'gunain' and 'intel' and 'kapitalis' and 'kursi' and 'otoritas' and 'jatah'
## and 'mamah' and 'nolak' and 'gass' and 'layout' and 'hein' and 'hapus' and
## 'idealisme' and 'kaum' and 'kontak' and 'mendagrinya' constant. Cannot scale
## data.
# Predict a sentiment label for every test document.
prediksi_Sentimen <- predict(svm_model, newdata = uji, probability = TRUE)

# Side-by-side table: tweet text, true label, predicted label.
hasilAkhir <- data.frame(
  text = test$processed_text,
  sentimen_asli = test$sentimen,
  prediksi = prediksi_Sentimen
)

# Confusion matrix
# Both vectors are coerced to factors with the same three levels so that the
# table always has the same class order.
kelas <- c("negatif", "netral", "positif")
pred <- factor(prediksi_Sentimen, levels = kelas)
actual <- factor(test$sentimen, levels = kelas)
confusionMatrix(data = pred, reference = actual)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction negatif netral positif
##    negatif     220     15       0
##    netral        0     40       1
##    positif       0      0       4
## 
## Overall Statistics
##                                          
##                Accuracy : 0.9429         
##                  95% CI : (0.9089, 0.967)
##     No Information Rate : 0.7857         
##     P-Value [Acc > NIR] : 2.492e-13      
##                                          
##                   Kappa : 0.8166         
##                                          
##  Mcnemar's Test P-Value : NA             
## 
## Statistics by Class:
## 
##                      Class: negatif Class: netral Class: positif
## Sensitivity                  1.0000        0.7273        0.80000
## Specificity                  0.7500        0.9956        1.00000
## Pos Pred Value               0.9362        0.9756        1.00000
## Neg Pred Value               1.0000        0.9372        0.99638
## Prevalence                   0.7857        0.1964        0.01786
## Detection Rate               0.7857        0.1429        0.01429
## Detection Prevalence         0.8393        0.1464        0.01429
## Balanced Accuracy            0.8750        0.8614        0.90000
# cara 2
library(RTextTools)
## Warning: package 'RTextTools' was built under R version 4.3.3
## Loading required package: SparseM
## Warning: package 'SparseM' was built under R version 4.3.3
# Labels for every row of `dtm`, in the same order as `combined`.
training_codes <- combined$sentimen

# Build the RTextTools container: the first nrow(train) rows are used for
# training, the remaining rows for testing. seq_len() / seq(length.out = )
# replace `1:n` so that an empty part produces an empty index rather than a
# reversed sequence.
n_train <- nrow(train)
container <- create_container(
  dtm,
  t(training_codes),
  trainSize = seq_len(n_train),
  testSize = seq(n_train + 1L, length.out = nrow(combined) - n_train),
  virgin = FALSE
)

# Train a linear-kernel SVM and classify the test rows.
models <- train_models(container, algorithms = "SVM", kernel = "linear")
results <- classify_models(container, models)

# Side-by-side table: tweet text, true label, predicted label.
hasil_linear <- data.frame(
  text = test$processed_text,
  sentimen_asli = test$sentimen,
  prediksi = results$SVM_LABEL
)

# Confusion matrix for the RTextTools model
# True and predicted labels are coerced to factors over the same three levels
# before being compared.
level_sentimen <- c("negatif", "netral", "positif")
ke_faktor <- function(label) factor(label, levels = level_sentimen)

actual <- ke_faktor(test$sentimen)
pred_sentimen <- ke_faktor(results$SVM_LABEL)

conf_matrix_linear <- confusionMatrix(
  data = pred_sentimen,
  reference = actual
)
conf_matrix_linear
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction negatif netral positif
##    negatif     219      0       0
##    netral        1     55       1
##    positif       0      0       4
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9929          
##                  95% CI : (0.9744, 0.9991)
##     No Information Rate : 0.7857          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9793          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: negatif Class: netral Class: positif
## Sensitivity                  0.9955        1.0000        0.80000
## Specificity                  1.0000        0.9911        1.00000
## Pos Pred Value               1.0000        0.9649        1.00000
## Neg Pred Value               0.9836        1.0000        0.99638
## Prevalence                   0.7857        0.1964        0.01786
## Detection Rate               0.7821        0.1964        0.01429
## Detection Prevalence         0.7821        0.2036        0.01429
## Balanced Accuracy            0.9977        0.9956        0.90000
# Evaluation metrics for a classification model
#
# Builds a caret confusion matrix from predicted and true labels and returns
# per-class precision, recall and F1 together with the overall accuracy.
#
# Args:
#   prediksi: factor of predicted labels.
#   aktual:   factor of true labels (passed to confusionMatrix() as reference).
#
# Returns:
#   A data.frame with one row per class (row names such as "Class: negatif")
#   and the columns Precision, Recall, F1_score and Accuracy, all rounded to
#   two decimals. Accuracy is the single overall value repeated on every row.
evaluasi_modelKlasif <- function(prediksi, aktual) {
  cm <- confusionMatrix(prediksi, aktual)
  akurasi <- unname(cm$overall["Accuracy"])

  # With exactly two classes, `byClass` is a named vector for the positive
  # class rather than a class-by-metric matrix, and `[, "Precision"]` would
  # fail with "incorrect number of dimensions". Promote it to a one-row matrix
  # so the lookups below work in both cases.
  per_kelas <- cm$byClass
  if (is.null(dim(per_kelas))) {
    per_kelas <- matrix(
      per_kelas,
      nrow = 1L,
      dimnames = list(paste("Class:", cm$positive), names(per_kelas))
    )
  }

  # Extract one metric column, without names, rounded to two decimals.
  ambil <- function(metrik) round(unname(per_kelas[, metrik]), 2)

  data.frame(
    Precision = ambil("Precision"),
    Recall = ambil("Recall"),
    F1_score = ambil("F1"),
    Accuracy = round(akurasi, 2),
    row.names = rownames(per_kelas)
  )
}

# Per-class metrics for the RTextTools SVM predictions; the bare name on the
# last line prints the table.
hasil_evaluasi_matriks <- evaluasi_modelKlasif(
  prediksi = pred_sentimen,
  aktual = actual
)
hasil_evaluasi_matriks
##                Precision Recall F1_score Accuracy
## Class: negatif      1.00    1.0     1.00     0.99
## Class: netral       0.96    1.0     0.98     0.99
## Class: positif      1.00    0.8     0.89     0.99