# 1. Load library
library(ggplot2)
library(readr)
library(dplyr)
library(e1071)
library(tm)
library(caret)
library(caTools)
library(RTextTools)
library(wordcloud2)
# 2. Read the data
# Cleaned tweet data; the column spec printed below lists processed_text,
# sentimen and created_at.
data_sentimen <- read_csv(
  file = "C:/Users/Acer/OneDrive - untirta.ac.id/Kuliah/MK/SEMESTER 5/Data Challenge/data_cleaning_optimal.csv"
)
## Rows: 933 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (2): processed_text, sentimen
## dttm (1): created_at
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# 3. Sentiment visualisation
# One row per sentiment label, with its frequency in column `n`.
sentimen_count <- count(data_sentimen, sentimen)

# Bar chart: number of tweets in each sentiment category.
ggplot(
  data = sentimen_count,
  mapping = aes(x = sentimen, y = n, fill = sentimen)
) +
  geom_col() +
  labs(title = "Distribusi Sentimen", x = "Kategori", y = "Jumlah Tweet") +
  theme_minimal()

# Pie chart: a single stacked bar (constant x) wrapped into polar coordinates
# so each sentiment becomes a slice proportional to its count.
ggplot(
  data = sentimen_count,
  mapping = aes(x = "", y = n, fill = sentimen)
) +
  geom_bar(stat = "identity", width = 1) +
  coord_polar(theta = "y") +
  labs(title = "Persentase Sentimen") +
  theme_void()

# 4. Train & test data
# Fix the RNG seed so the 70% split drawn by sample.split() is reproducible.
set.seed(1861)
split <- sample.split(data_sentimen$sentimen, SplitRatio = 0.70)
data_train <- subset(data_sentimen, split == TRUE)
write.csv(data_train, file = "train_data2.csv", row.names = FALSE)

# Train data: re-read the CSV that was just written, then keep only the text
# and label columns. Columns are selected by name (not by dropping position 1)
# so a change in the CSV column order cannot silently remove the wrong column.
# NOTE(review): the rows with split == FALSE are never used; the test set is
# read from a separate file further down. Confirm that file does not overlap
# with these training rows.
train <- read.csv("train_data2.csv")
train <- train[, c("processed_text", "sentimen"), drop = FALSE]
dim(train)
## [1] 652   2
# Test data: read from a separate CSV file.
test <- read.csv(
  file = "C:/Users/Acer/OneDrive - untirta.ac.id/Kuliah/MK/SEMESTER 5/Data Challenge/test_data.csv"
)
dim(test)
## [1] 280   2
# Stack train on top of test so both share one vocabulary when the text is
# converted to a numeric document-term matrix. Train rows come first.
combined <- rbind(train, test)
corpus <- Corpus(VectorSource(combined[["processed_text"]]))

# Document-term matrix with a non-normalised TF-IDF weighting requested.
# NOTE(review): tm warns "custom functions are ignored" for this call (see the
# output below); check attr(dtm, "weighting") to confirm which weighting the
# resulting matrix actually carries.
dtm <- DocumentTermMatrix(
  x = corpus,
  control = list(
    weighting = function(x) weightTfIdf(x, normalize = FALSE)
  )
)
## Warning in TermDocumentMatrix.SimpleCorpus(x, control): custom functions are
## ignored
# 5. Modelling
# Labels for every row of `combined` (train rows first, then test rows).
training_codes <- combined$sentimen

# Row indices are built with seq_len() so an empty partition yields an empty
# index vector instead of a reversed range such as 1:0.
container <- create_container(
  dtm,
  t(training_codes),
  trainSize = seq_len(nrow(train)),
  testSize = nrow(train) + seq_len(nrow(combined) - nrow(train)),
  virgin = FALSE
)

# Fit an SVM with a linear kernel on the training rows, then classify the
# test rows of the container.
models <- train_models(container, algorithms = "SVM", kernel = "linear")
results <- classify_models(container, models)

# Side-by-side view of each test text, its true label and the SVM prediction.
hasil_linear <- data.frame(
  text = test$processed_text,
  sentimen_asli = test$sentimen,
  prediksi = results$SVM_LABEL
)

# Confusion matrix
# Both factors get the same explicit level set so the table has identical
# rows and columns even if a class is missing from one of the vectors.
level_sentimen <- c("negatif", "netral", "positif")

pred_sentimen <- factor(results$SVM_LABEL, levels = level_sentimen)
actual <- factor(test$sentimen, levels = level_sentimen)

conf_matrix_linear <- confusionMatrix(
  data = pred_sentimen,
  reference = actual
)
conf_matrix_linear
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction negatif netral positif
##    negatif     219      0       0
##    netral        1     55       1
##    positif       0      0       4
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9929          
##                  95% CI : (0.9744, 0.9991)
##     No Information Rate : 0.7857          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9793          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: negatif Class: netral Class: positif
## Sensitivity                  0.9955        1.0000        0.80000
## Specificity                  1.0000        0.9911        1.00000
## Pos Pred Value               1.0000        0.9649        1.00000
## Neg Pred Value               0.9836        1.0000        0.99638
## Prevalence                   0.7857        0.1964        0.01786
## Detection Rate               0.7821        0.1964        0.01429
## Detection Prevalence         0.7821        0.2036        0.01429
## Balanced Accuracy            0.9977        0.9956        0.90000
# 6. Model evaluation metrics

#' Summarise precision, recall, F1 and overall accuracy of a classifier.
#'
#' @param prediksi Factor of predicted labels, passed to `confusionMatrix()`
#'   as the predictions.
#' @param aktual Factor of reference labels, passed to `confusionMatrix()`
#'   as the truth.
#' @return A data frame with one row per class (a single row for a two-class
#'   problem) and the columns `Precision`, `Recall`, `F1_score` and
#'   `Accuracy`, each rounded to two decimals. `Accuracy` is the overall
#'   accuracy, repeated on every row.
evaluasi_modelKlasif <- function(prediksi, aktual) {
  cm <- confusionMatrix(prediksi, aktual)
  per_class <- cm$byClass

  # With only two classes `byClass` is a named vector rather than a
  # class-by-metric matrix, so `[, "Precision"]` would fail. Reshape it into a
  # one-row matrix labelled with the positive class.
  if (is.null(dim(per_class))) {
    per_class <- matrix(
      per_class,
      nrow = 1,
      dimnames = list(paste("Class:", cm$positive), names(per_class))
    )
  }

  data.frame(
    Precision = round(unname(per_class[, "Precision"]), 2),
    Recall = round(unname(per_class[, "Recall"]), 2),
    F1_score = round(unname(per_class[, "F1"]), 2),
    Accuracy = round(unname(cm$overall["Accuracy"]), 2),
    row.names = rownames(per_class)
  )
}

# Per-class metrics for the linear-kernel SVM predictions on the test labels.
hasil_evaluasi_matriks <- evaluasi_modelKlasif(
  prediksi = pred_sentimen,
  aktual = actual
)
hasil_evaluasi_matriks
##                Precision Recall F1_score Accuracy
## Class: negatif      1.00    1.0     1.00     0.99
## Class: netral       0.96    1.0     0.98     0.99
## Class: positif      1.00    0.8     0.89     0.99
# Word cloud visualisation

#' Draw a word cloud of the terms in texts that carry a given sentiment.
#'
#' @param data Data frame with the columns `processed_text` and `sentimen`.
#' @param sentimen_label Value of `sentimen` whose rows are kept.
#' @return The value returned by `wordcloud2()` for the term-frequency table.
wc <- function(data, sentimen_label) {
  # Filter the data frame passed by the caller rather than a global object,
  # so the function works on any data set with the expected columns.
  teks_sentimen <- data %>%
    filter(sentimen == sentimen_label) %>%
    pull(processed_text)

  # Fail early with a clear message instead of building an empty corpus.
  if (length(teks_sentimen) == 0) {
    stop(
      "No rows with sentimen == \"", sentimen_label, "\" to plot.",
      call. = FALSE
    )
  }

  # Term-document matrix: terms in rows, so row sums are total term counts.
  corpus <- Corpus(VectorSource(teks_sentimen))
  tdm <- TermDocumentMatrix(corpus)
  term_freq <- sort(rowSums(as.matrix(tdm)), decreasing = TRUE)
  d <- data.frame(word = names(term_freq), freq = term_freq)

  # NOTE(review): the seed is fixed before drawing; confirm whether the
  # wordcloud2 layout and colours actually depend on R's RNG.
  set.seed(18)
  wordcloud2(
    data = d,
    size = 0.5,
    shape = "circle",
    color = "random-dark",
    backgroundColor = "white"
  )
}
# One word cloud per sentiment class, drawn from the full data set.
wc(data = data_sentimen, sentimen_label = "positif")
wc(data = data_sentimen, sentimen_label = "netral")
wc(data = data_sentimen, sentimen_label = "negatif")