# 1. Load library
library(ggplot2)
library(readr)
library(dplyr)
library(e1071)
library(tm)
library(caret)
library(caTools)
# 2. Read the data
# Cleaned tweet data; the column specification printed by read_csv() lists
# `processed_text`, `sentimen` and `created_at`.
data_sentimen <- readr::read_csv(
  file = "C:/Users/Acer/OneDrive - untirta.ac.id/Kuliah/MK/SEMESTER 5/Data Challenge/data_cleaning_optimal.csv"
)
## Rows: 933 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): processed_text, sentimen
## dttm (1): created_at
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# 3. Sentiment visualisation
# One row per sentiment label with its frequency in column `n`.
sentimen_count <- count(data_sentimen, sentimen)

# Bar chart: one bar per sentiment category, bar height taken from `n`.
ggplot(
  data = sentimen_count,
  mapping = aes(x = sentimen, y = n, fill = sentimen)
) +
  geom_col() +
  labs(title = "Distribusi Sentimen", x = "Kategori", y = "Jumlah Tweet") +
  theme_minimal()

# Pie chart: a single stacked bar drawn in polar coordinates, so each
# category's angle is proportional to its count.
ggplot(
  data = sentimen_count,
  mapping = aes(x = "", y = n, fill = sentimen)
) +
  geom_bar(stat = "identity", width = 1) +
  coord_polar(theta = "y") +
  labs(title = "Persentase Sentimen") +
  theme_void()

# 4. Train & test data
# Fixed seed so the random split is reproducible.
set.seed(1861)
# sample.split() returns a logical vector; TRUE marks rows assigned to the
# training portion (70% requested via SplitRatio).
split <- sample.split(data_sentimen$sentimen, SplitRatio = 0.70)
data_train <- data_sentimen[split, ]
# NOTE(review): the rows with split == FALSE are never used in this script;
# the test set is read from a separate CSV further down. Confirm that file was
# produced from this same split.
write.csv(data_train, file = "train_data2.csv", row.names = FALSE)

# Train data
train <- read.csv("train_data2.csv")
# Drop the timestamp by name rather than by position so the result does not
# depend on the column order of the CSV; only `processed_text` and `sentimen`
# are used afterwards.
train <- train[, setdiff(names(train), "created_at"), drop = FALSE]
dim(train)
## [1] 652 2
# Test data
# Read from a separate, pre-existing CSV; `processed_text` and `sentimen` are
# the columns used from it later in the script.
test <- read.csv(
  file = "C:/Users/Acer/OneDrive - untirta.ac.id/Kuliah/MK/SEMESTER 5/Data Challenge/test_data.csv"
)
dim(test)
## [1] 280 2
# Combine train and test so both are converted to numbers over one shared
# vocabulary (train rows first, then test rows).
combined <- rbind(train, test)
corpus <- Corpus(VectorSource(combined$processed_text))
# Build the raw term-count matrix first and apply TF-IDF explicitly afterwards.
# Passing the weighting function through `control` makes tm warn that
# "custom functions are ignored" for this corpus type, which leaves it unclear
# whether the weighting was actually applied.
dtm <- DocumentTermMatrix(corpus)
dtm <- weightTfIdf(dtm, normalize = FALSE)
# NOTE(review): the IDF part is computed over train + test documents together,
# so test-set statistics influence the training features. Confirm this is
# acceptable for the evaluation being reported.
## Warning in TermDocumentMatrix.SimpleCorpus(x, control): custom functions are
## ignored
# Split the document-term matrix back into train and test rows for the model.
# `combined` was built as rbind(train, test), so the first nrow(train) rows are
# the training documents and the remainder are the test documents.
# seq_len() is used instead of `1:n` / `(n + 1):m` so that an empty part yields
# an empty index rather than a descending sequence.
dtm_train <- dtm[seq_len(nrow(train)), ]
dtm_test <- dtm[nrow(train) + seq_len(nrow(combined) - nrow(train)), ]
# Dense matrices, as passed to svm() and predict() below.
latih <- as.matrix(dtm_train)
uji <- as.matrix(dtm_test)
# SVM model
# Linear-kernel SVM on the training term matrix; `probability = TRUE` fits the
# model so class probabilities can be requested at prediction time.
#
# `scale = FALSE` is set explicitly: the vocabulary comes from train + test, so
# every term that occurs only in test documents is a constant column in
# `latih`. With the default scaling svm() reports those columns as constant and
# emits "Cannot scale data" (hundreds of names in the warning). As far as the
# e1071 source shows, it then proceeds without scaling anyway, so this keeps
# the fitted model the same while removing the warning — TODO confirm against
# the installed e1071 version.
svm_model <- svm(
  x = latih,
  y = as.factor(train$sentimen),
  kernel = "linear",
  scale = FALSE,
  probability = TRUE
)
## Warning in svm.default(x = latih, y = as.factor(train$sentimen), kernel =
## "linear", : Variable(s) 'buhay' and 'gobyerno' and 'habang' and 'kesal' and
## 'dibanggain' and 'hrsnya' and 'perfeksionisme' and 'akwoakwo' and 'cucu' and
## 'dri' and 'film' and 'macem' and 'nurutin' and 'part' and 'ton' and 'gelato'
## and 'ikut' and 'ndak' and 'presiden' and 'about' and 'daerah' and 'freeport'
## and 'kejam' and 'louder' and 'papua' and 'should' and 'somehow' and 'tanah' and
## 'aaccb' and 'ceed' and 'biktima' and 'jongkok' and 'literasi' and 'rilis' and
## 'dibiarin' and 'elah' and 'emphasis' and 'gabut' and 'have' and 'korup' and
## 'left' and 'letter' and 'ngebunuh' and 'nyelametin' and 'periksa' and 'with'
## and 'apel' and 'astagfirullahaladzim' and 'hadap' and 'pasteur' and 'puskes'
## and 'busa' and 'betapa' and 'busuk' and 'alot' and 'bosss' and 'buru' and
## 'jlek' and 'kasus' and 'blok' and 'enek' and 'unfriend' and 'much' and 'than'
## and 'worse' and 'itumah' and 'franz' and 'magnis' and 'marxisme' and 'botol'
## and 'nada' and 'ria' and 'tat' and 'tot' and 'urgensi' and 'trash' and 'valid'
## and 'dream' and 'guaranteed' and 'sweet' and 'ngeroko' and 'pick' and
## 'struggle' and 'too' and 'anthum' and 'busway' and 'faham' and 'kntol' and
## 'patroli' and 'always' and 'book' and 'carry' and 'khusus' and 'kosmetik' and
## 'lainnn' and 'nyampur' and 'ransel' and 'tangkal' and 'tari' and 'iii' and
## 'once' and 'wkwkwkwk' and 'being' and 'beloved' and 'bias' and 'cherry' and
## 'mom' and 'picking' and 'gelar' and 'belaka' and 'dengung' and 'gera' and
## 'gerak' and 'inspirasi' and 'pecah' and 'seasters' and 'sendat' and
## 'seremonial' and 'solid' and 'gih' and 'agitasi' and 'bareng' and 'dampak' and
## 'edukasi' and 'follower' and 'promosi' and 'provokasi' and 'slogan' and 'bae'
## and 'gawe' and 'terror' and 'basa' and 'besfriend' and 'lho' and 'ngajak' and
## 'ngopo' and 'nyesel' and 'generalisasi' and 'drtd' and 'puny' and 'sbaik' and
## 'terusss' and 'punyeta' and 'buah' and 'koplak' and 'sik' and 'haha' and 'ckck'
## and 'beranta' and 'kirain' and 'toyoda' and 'yuda' and 'buset' and 'ngamuk' and
## 'twistan' and 'luhk' and 'nying' and 'pis' and 'rem' and 'sintinkkk' and
## 'momen' and 'oalah' and 'version' and 'double' and 'halang' and 'job' and
## 'alumni' and 'pulak' and 'sejuk' and 'selera' and 'bogor' and 'isang' and
## 'kabila' and 'labas' and 'pasok' and 'tenga' and 'anjinkkkk' and 'bgttt' and
## 'tampan' and 'oops' and 'cicil' and 'gabakal' and 'jagain' and 'goblog' and
## 'hadeeee' and 'akumulasi' and 'birokratik' and 'birokratiknya' and 'finansial'
## and 'gunain' and 'intel' and 'kapitalis' and 'kursi' and 'otoritas' and 'jatah'
## and 'mamah' and 'nolak' and 'gass' and 'layout' and 'hein' and 'hapus' and
## 'idealisme' and 'kaum' and 'kontak' and 'mendagrinya' constant. Cannot scale
## data.
# Predict the sentiment of every test document with the fitted SVM.
prediksi_Sentimen <- predict(svm_model, newdata = uji, probability = TRUE)

# Side-by-side table: test text, its original label, and the predicted label.
hasilAkhir <- data.frame(
  text = test$processed_text,
  sentimen_asli = test$sentimen,
  prediksi = prediksi_Sentimen
)

# Confusion matrix
# Both factors are given the same explicit level set so the table rows and
# columns line up.
pred <- factor(
  prediksi_Sentimen,
  levels = c("negatif", "netral", "positif")
)
actual <- factor(
  test$sentimen,
  levels = c("negatif", "netral", "positif")
)
confusionMatrix(data = pred, reference = actual)
## Confusion Matrix and Statistics
##
## Reference
## Prediction negatif netral positif
## negatif 220 15 0
## netral 0 40 1
## positif 0 0 4
##
## Overall Statistics
##
## Accuracy : 0.9429
## 95% CI : (0.9089, 0.967)
## No Information Rate : 0.7857
## P-Value [Acc > NIR] : 2.492e-13
##
## Kappa : 0.8166
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: negatif Class: netral Class: positif
## Sensitivity 1.0000 0.7273 0.80000
## Specificity 0.7500 0.9956 1.00000
## Pos Pred Value 0.9362 0.9756 1.00000
## Neg Pred Value 1.0000 0.9372 0.99638
## Prevalence 0.7857 0.1964 0.01786
## Detection Rate 0.7857 0.1429 0.01429
## Detection Prevalence 0.8393 0.1464 0.01429
## Balanced Accuracy 0.8750 0.8614 0.90000
# cara 2
library(RTextTools)
## Warning: package 'RTextTools' was built under R version 4.3.3
## Loading required package: SparseM
## Warning: package 'SparseM' was built under R version 4.3.3
# Labels for all documents, in the same row order as `dtm` (train, then test).
training_codes <- combined$sentimen
# RTextTools container: the first nrow(train) rows are used for training and
# the remaining rows for testing. seq_len() keeps both index vectors correct
# (empty rather than descending) if either part has no rows.
# NOTE(review): `virgin = FALSE` presumably declares that the test rows have
# known labels — confirm against the RTextTools documentation.
container <- create_container(
  dtm,
  t(training_codes),
  trainSize = seq_len(nrow(train)),
  testSize = nrow(train) + seq_len(nrow(combined) - nrow(train)),
  virgin = FALSE
)
# Train an SVM with a linear kernel and classify the test rows.
models <- train_models(container, algorithms = "SVM", kernel = "linear")
results <- classify_models(container, models)
# Side-by-side table: test text, its original label, and the label predicted
# by the RTextTools SVM.
hasil_linear <- data.frame(
  text = test$processed_text,
  sentimen_asli = test$sentimen,
  prediksi = results$SVM_LABEL
)

# Confusion matrix
# A shared level vector keeps reference and prediction factors aligned.
level_sentimen <- c("negatif", "netral", "positif")
actual <- factor(test$sentimen, levels = level_sentimen)
pred_sentimen <- factor(results$SVM_LABEL, levels = level_sentimen)
conf_matrix_linear <- confusionMatrix(
  data = pred_sentimen,
  reference = actual
)
print(conf_matrix_linear)
## Confusion Matrix and Statistics
##
## Reference
## Prediction negatif netral positif
## negatif 219 0 0
## netral 1 55 1
## positif 0 0 4
##
## Overall Statistics
##
## Accuracy : 0.9929
## 95% CI : (0.9744, 0.9991)
## No Information Rate : 0.7857
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9793
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: negatif Class: netral Class: positif
## Sensitivity 0.9955 1.0000 0.80000
## Specificity 1.0000 0.9911 1.00000
## Pos Pred Value 1.0000 0.9649 1.00000
## Neg Pred Value 0.9836 1.0000 0.99638
## Prevalence 0.7857 0.1964 0.01786
## Detection Rate 0.7821 0.1964 0.01429
## Detection Prevalence 0.7821 0.2036 0.01429
## Balanced Accuracy 0.9977 0.9956 0.90000
# Compute evaluation metrics for a classification model.
#
# Args:
#   prediksi: factor of predicted labels.
#   aktual:   factor of reference (true) labels, passed to confusionMatrix()
#             together with `prediksi`.
#
# Returns:
#   A data frame with one row per class and the columns Precision, Recall,
#   F1_score and Accuracy, all rounded to two decimals. Accuracy is the overall
#   accuracy, repeated on every row.
evaluasi_modelKlasif <- function(prediksi, aktual) {
  cm <- confusionMatrix(prediksi, aktual)

  per_kelas <- cm$byClass
  # With only two classes `byClass` is a plain named vector (metrics for the
  # positive class) rather than a class-by-metric matrix, so `[, "Precision"]`
  # would fail. Promote it to a one-row matrix so both cases share one path.
  if (is.null(dim(per_kelas))) {
    per_kelas <- matrix(
      per_kelas,
      nrow = 1,
      dimnames = list(paste("Class:", cm$positive), names(per_kelas))
    )
  }

  data.frame(
    Precision = round(unname(per_kelas[, "Precision"]), 2),
    Recall = round(unname(per_kelas[, "Recall"]), 2),
    F1_score = round(unname(per_kelas[, "F1"]), 2),
    Accuracy = round(unname(cm$overall["Accuracy"]), 2),
    row.names = rownames(per_kelas)
  )
}
# Per-class precision/recall/F1 plus overall accuracy for the RTextTools SVM
# predictions against the reference test labels.
hasil_evaluasi_matriks <- evaluasi_modelKlasif(
  prediksi = pred_sentimen,
  aktual = actual
)
print(hasil_evaluasi_matriks)
## Precision Recall F1_score Accuracy
## Class: negatif 1.00 1.0 1.00 0.99
## Class: netral 0.96 1.0 0.98 0.99
## Class: positif 1.00 0.8 0.89 0.99