# 1. Load libraries
library(ggplot2)
library(readr)
library(dplyr)
library(e1071)
library(tm)
library(caret)
library(caTools)
library(RTextTools)
library(wordcloud2)
# 2. Read the data
data_sentimen <- read_csv("C:/Users/Acer/OneDrive - untirta.ac.id/Kuliah/MK/SEMESTER 5/Data Challenge/data_cleaning_optimal.csv")
## Rows: 933 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): processed_text, sentimen
## dttm (1): created_at
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
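# Optional sketch: declaring the column types up front avoids the message above
# and catches parsing issues early (the col_types below are an assumption that
# mirrors the printed spec; show_col_types = FALSE alone would also quiet it).
data_sentimen <- read_csv(
"C:/Users/Acer/OneDrive - untirta.ac.id/Kuliah/MK/SEMESTER 5/Data Challenge/data_cleaning_optimal.csv",
col_types = cols(
created_at = col_datetime(),
processed_text = col_character(),
sentimen = col_character()
)
)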
# 3. Sentiment visualization
sentimen_count <- data_sentimen %>% count(sentimen)
# Bar Chart
ggplot(sentimen_count, aes(x = sentimen, y = n, fill = sentimen)) +
geom_bar(stat = "identity") +
labs(title = "Sentiment Distribution", x = "Category", y = "Number of Tweets") +
theme_minimal()

# Pie Chart
ggplot(sentimen_count, aes(x = "", y = n, fill = sentimen)) +
geom_col(width = 1) +
coord_polar(theta = "y") +
theme_void() +
labs(title = "Persentase Sentimen")

# 4. Train & test data
set.seed(1861)
split <- sample.split(data_sentimen$sentimen, SplitRatio = 0.70)
data_train <- subset(data_sentimen, split == TRUE)
write.csv(data_train, file = "train_data2.csv", row.names = FALSE)
# Train Data #
train <- read.csv("train_data2.csv")
train <- train[,-1]  # drop the created_at column; keep only processed_text and sentimen
dim(train)
## [1] 652 2
# Test Data #
test <- read.csv("C:/Users/Acer/OneDrive - untirta.ac.id/Kuliah/MK/SEMESTER 5/Data Challenge/test_data.csv")
dim(test)
## [1] 280 2
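# Quick sanity check (sketch): sample.split() stratifies on the label, so the
# class proportions in the full data, the train set, and the test set should
# be roughly equal.
round(prop.table(table(data_sentimen$sentimen)), 3)
round(prop.table(table(train$sentimen)), 3)
round(prop.table(table(test$sentimen)), 3)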
# combine train and test so the text can be converted into numeric features on a shared vocabulary
combined <- rbind(train, test)
corpus <- Corpus(VectorSource(combined$processed_text))
dtm <- DocumentTermMatrix(
corpus,
control = list(weighting = function(x) weightTfIdf(x, normalize = FALSE))
)
## Warning in TermDocumentMatrix.SimpleCorpus(x, control): custom functions are
## ignored
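# Note: Corpus(VectorSource(...)) builds a SimpleCorpus, and the warning above
# indicates that tm ignores custom control functions for it, so the custom
# TF-IDF weighting may not actually be applied. A sketch that avoids the
# ambiguity by using a VCorpus (kept under a separate name so the results
# reported below are unchanged):
corpus_v <- VCorpus(VectorSource(combined$processed_text))
dtm_tfidf <- DocumentTermMatrix(
corpus_v,
control = list(weighting = function(x) weightTfIdf(x, normalize = FALSE))
)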
# 5. Modeling
training_codes <- combined$sentimen
container <- create_container(dtm,
t(training_codes),
trainSize = 1:nrow(train),
testSize = (nrow(train) + 1):nrow(combined),
virgin = FALSE)
models <- train_models(container, algorithms = "SVM", kernel = "linear")
results <- classify_models(container, models)
hasil_linear <- data.frame(
text = test$processed_text,
sentimen_asli = test$sentimen,
prediksi = results$SVM_LABEL
)
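# Optional sketch: classify_models() also reports a confidence score per
# prediction (for an SVM model, RTextTools calls the column SVM_PROB); sorting
# by it surfaces the least confident predictions for manual review.
hasil_linear$prob <- results$SVM_PROB
head(hasil_linear[order(hasil_linear$prob), ], 5)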
# Confusion Matrix
level_sentimen <- c("negatif", "netral", "positif")
actual <- factor(test$sentimen, levels = level_sentimen)
pred_sentimen <- factor(results$SVM_LABEL, levels = level_sentimen)
conf_matrix_linear <- confusionMatrix(pred_sentimen, actual)
conf_matrix_linear
## Confusion Matrix and Statistics
##
## Reference
## Prediction negatif netral positif
## negatif 219 0 0
## netral 1 55 1
## positif 0 0 4
##
## Overall Statistics
##
## Accuracy : 0.9929
## 95% CI : (0.9744, 0.9991)
## No Information Rate : 0.7857
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9793
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: negatif Class: netral Class: positif
## Sensitivity 0.9955 1.0000 0.80000
## Specificity 1.0000 0.9911 1.00000
## Pos Pred Value 1.0000 0.9649 1.00000
## Neg Pred Value 0.9836 1.0000 0.99638
## Prevalence 0.7857 0.1964 0.01786
## Detection Rate 0.7821 0.1964 0.01429
## Detection Prevalence 0.7821 0.2036 0.01429
## Balanced Accuracy 0.9977 0.9956 0.90000
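# Optional sketch: the confusion matrix as a ggplot heatmap; cm_df is a helper
# data frame built here from conf_matrix_linear$table.
cm_df <- as.data.frame(conf_matrix_linear$table)
ggplot(cm_df, aes(x = Reference, y = Prediction, fill = Freq)) +
geom_tile() +
geom_text(aes(label = Freq)) +
labs(title = "Confusion Matrix - Linear SVM") +
theme_minimal()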
# 6. Model evaluation metrics
evaluasi_modelKlasif <- function(prediksi, aktual) {
cm <- confusionMatrix(prediksi, aktual)
akurasi <- cm$overall["Accuracy"]
precision <- cm$byClass[, "Precision"]
recall <- cm$byClass[, "Recall"]
f1 <- cm$byClass[, "F1"]
hasil_eval <- data.frame(
Precision = round(precision, 2),
Recall = round(recall, 2),
F1_score = round(f1, 2),
Accuracy = round(akurasi, 2)
)
return(hasil_eval)
}
hasil_evaluasi_matriks <- evaluasi_modelKlasif(pred_sentimen, actual)
hasil_evaluasi_matriks
## Precision Recall F1_score Accuracy
## Class: negatif 1.00 1.0 1.00 0.99
## Class: netral 0.96 1.0 0.98 0.99
## Class: positif 1.00 0.8 0.89 0.99
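# Optional sketch: macro-averaged scores (unweighted mean over the three
# classes) summarize the per-class table into a single value per metric.
round(colMeans(hasil_evaluasi_matriks[, c("Precision", "Recall", "F1_score")]), 2)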
# Word cloud visualization
wc <- function(data, sentimen_label) {
# filter to the requested sentiment class and build a term-frequency table
# (use the `data` argument; the original body referenced the global data_sentimen)
teks_sentimen <- data %>% filter(sentimen == sentimen_label) %>% pull(processed_text)
corpus <- Corpus(VectorSource(teks_sentimen))
dtm <- TermDocumentMatrix(corpus)
m <- as.matrix(dtm)
kata <- sort(rowSums(m), decreasing = TRUE)
d <- data.frame(word = names(kata), freq = kata)
set.seed(18)
wordcloud2(
data = d,
size = 0.5,
shape = 'circle',
color = "random-dark",
backgroundColor = "white"
)
}
wc(data_sentimen, "positif")
wc(data_sentimen, "netral")
wc(data_sentimen, "negatif")