The analysis uses secondary data in the form of hotel reviews that carry only positive and negative labels.
library(dplyr)
library(tm)
library(SnowballC)
library(e1071)
library(caret)
library(purrr)
hotel <- read.csv("datasekunder_general.csv")
hotel <- na.omit(hotel)
hotel <- hotel %>%
  mutate(labels = as.factor(labels))
corpus <- VCorpus(VectorSource(hotel$text))
corpus[[10]]$content
## [1] "Tempatnya nyaman, pelayanan ramah banget. Recommended banget tempatnya."
Remove Numbers
corpus_cl <- tm_map(x = corpus, FUN = removeNumbers)
corpus_cl[[10]]$content
## [1] "Tempatnya nyaman, pelayanan ramah banget. Recommended banget tempatnya."
corpus_cl <- tm_map(x = corpus_cl, FUN = content_transformer(tolower))
Convert Slang Words to Standard Forms
# Read the CSV file containing the slang word mappings
slang_data <- read.csv("new_kamusalay.csv", header = FALSE, stringsAsFactors = FALSE)
# Create a mapping from slang words to their meanings
slang_mapping <- setNames(slang_data$V2, slang_data$V1)
# Create a function to replace slang words with their meanings
replace_slang <- function(text) {
  words <- unlist(strsplit(text, " "))  # Split text into words
  replaced_words <- sapply(words, function(word) {
    if (word %in% names(slang_mapping)) {
      return(slang_mapping[word])
    } else {
      return(word)
    }
  })
  return(paste(replaced_words, collapse = " "))
}
corpus_cl <- tm_map(x = corpus_cl, FUN = content_transformer(replace_slang))
corpus_cl[[10]]$content
## [1] "tempatnya nyaman, pelayanan ramah banget. recommended banget tempatnya."
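As a quick sanity check, the replacer can also be called directly on a raw string. The sample sentence below is an assumption; the actual substitutions depend entirely on the entries in new_kamusalay.csv.
# Hypothetical spot-check of the slang replacer (output depends on the dictionary)
replace_slang("tempatnya nyaman bgt tapi kamarnya kotor")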
Remove Stopwords
stopwords_id <- tolower(readLines("stopwords_id.txt"))
## Warning in readLines("stopwords_id.txt"): incomplete final line found on
## 'stopwords_id.txt'
corpus_cl[[9]]$content
## [1] "banyak kecoa nya"
corpus_cl <- tm_map(x = corpus_cl, FUN = removeWords, stopwords_id)
corpus_cl[[10]]$content
## [1] "tempatnya nyaman, pelayanan ramah banget. recommended banget tempatnya."
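Note that informal intensifiers such as "banget" survive the standard stopword list, as the output above shows. If they should also be dropped, the list can simply be extended before removeWords is applied. The extra words below are an assumption, and the result is kept in a separate object so the original pipeline is unchanged.
# Sketch: append assumed domain-specific filler words to the stopword list
stopwords_id_ext <- c(stopwords_id, "banget", "nya")
corpus_alt <- tm_map(x = corpus_cl, FUN = removeWords, stopwords_id_ext)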
Remove Punctuation
corpus_cl <- tm_map(x = corpus_cl, FUN = removePunctuation)
corpus_cl[[10]]$content
## [1] "tempatnya nyaman pelayanan ramah banget recommended banget tempatnya"
Stemming
library(koRpus)
library(katadasaR)
library(tokenizers)
stem_katadasaR <- content_transformer(function(x) {
  paste(sapply(unlist(tokenizers::tokenize_words(x)), katadasaR::katadasaR), collapse = ' ')
})
corpus_cl <- tm_map(corpus_cl, stem_katadasaR)
corpus_cl[[10]]$content
## [1] "tempat nyaman ayan ramah banget recommended banget tempat"
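The stemmed output above also shows a quirk: "pelayanan" comes out as "ayan" rather than the expected base form "layan". Individual tokens can be passed to the stemmer directly to debug such cases; the example tokens below are arbitrary choices, and the returned base forms depend on katadasaR's dictionary.
# Spot-check the stemmer on single tokens (example tokens are assumptions)
katadasaR::katadasaR("pelayanan")
katadasaR::katadasaR("tempatnya")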
Remove Whitespace
corpus_cl <- tm_map(x = corpus_cl, FUN = stripWhitespace)
corpus_cl[[10]]$content
## [1] "tempat nyaman ayan ramah banget recommended banget tempat"
Document-Term Matrix
hotel_dtm <- DocumentTermMatrix(corpus_cl)
# Quick check of the DTM structure
inspect(hotel_dtm)
## <<DocumentTermMatrix (documents: 11142, terms: 9139)>>
## Non-/sparse entries: 105867/101720871
## Sparsity : 100%
## Maximal term length: 49
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs air airy ayan baik bersih hotel kamar kotor kurang mandi
## 11094 1 1 0 0 0 0 2 0 1 0
## 1258 0 0 0 0 0 0 0 0 0 0
## 2321 0 0 0 0 0 1 0 0 0 0
## 2951 0 0 0 0 0 0 0 0 0 0
## 3231 0 0 0 0 0 2 0 0 0 0
## 3936 0 1 0 0 0 0 0 0 0 0
## 4867 0 1 0 0 0 0 2 0 0 1
## 5324 0 2 0 0 0 1 0 0 0 0
## 6455 0 3 0 0 0 4 0 0 0 0
## 6713 0 0 0 0 0 0 0 0 0 0
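inspect() only prints a sample of documents and terms. A specific slice can be pulled out by indexing the DTM directly; the row and column ranges below are arbitrary examples.
# Look at a small, explicit slice of the document-term matrix
inspect(hotel_dtm[1:5, 1:8])
# Convert just that slice to a dense matrix if the raw counts are needed
as.matrix(hotel_dtm[1:5, 1:8])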
library(glue)
library(ggplot2)
library(plotly)
# Sort term frequencies in descending order
text_hotel <- as.matrix(hotel_dtm)
hotel_list <- sort(colSums(text_hotel), decreasing = T)
# Convert the frequencies to a data frame for visualization
hotel_df <- data.frame(word = names(hotel_list), freq=hotel_list)
hotel_df <- hotel_df %>%
  mutate(label = glue("Frekuensi: {freq}"))
library(wordcloud2)
library(RColorBrewer)
colors.wc <- brewer.pal(8, "Dark2")
wordcloud2(hotel_df, size = 1.5)
plot <- ggplot(head(hotel_df, 7), aes(y = reorder(word, freq), x = freq)) +
  geom_col(aes(fill = freq, text = label), show.legend = F) +
  labs(x = "Frekuensi",
       y = "Terms/Kata",
       title = "Frekuensi Kata Tertinggi") +
  # scale_x_continuous(labels = "kata") +
  scale_fill_gradient(low = "#85c946", high = "#304919") +
  theme_minimal() +
  theme(axis.text.y = element_text(face = "bold", size = 11))
## Warning in geom_col(aes(fill = freq, text = label), show.legend = F): Ignoring
## unknown aesthetics: text
ggplotly(plot, tooltip = "text")
RNGkind(sample.kind = "Rounding")
## Warning in RNGkind(sample.kind = "Rounding"): non-uniform 'Rounding' sampler
## used
set.seed(100)
# train-test splitting
index <- sample(nrow(hotel_dtm), nrow(hotel_dtm)*0.75)
hotel_train <- hotel_dtm[index,]
hotel_test <- hotel_dtm[-index,]
A model will be built for each target variable.
Select the target variable (general)
train_labels_gen <- hotel[index,2]
test_labels_gen <- hotel[-index,2]
Check the label distribution in the train and test sets
prop.table(table(train_labels_gen))
## train_labels_gen
## 0 1
## 0.8652465 0.1347535
prop.table(table(test_labels_gen))
## test_labels_gen
## 0 1
## 0.8610912 0.1389088
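The random split above happens to keep the label proportions close between train and test. If stricter stratification is wanted, caret (already loaded) can build a stratified index instead; this is only an alternative sketch and is not what the rest of this write-up uses.
# Sketch: stratified 75/25 split on the labels (alternative to sample())
index_strat <- createDataPartition(hotel$labels, p = 0.75, list = FALSE)
# hotel_train_strat <- hotel_dtm[index_strat, ]
# hotel_test_strat  <- hotel_dtm[-index_strat, ]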
Further Data Preparation
Remove rarely occurring words
print(dim(hotel_train))
## [1] 8356 9139
The predictors contain 9139 terms, so terms that appear fewer than 10 times across the training documents are removed.
freq <- findFreqTerms(hotel_train,lowfreq = 10)
length(freq)
## [1] 900
hotel_train_gen_filter <- hotel_train[,freq]
inspect(hotel_train_gen_filter)
## <<DocumentTermMatrix (documents: 8356, terms: 900)>>
## Non-/sparse entries: 67536/7452864
## Sparsity : 99%
## Maximal term length: 13
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs air airy ayan baik bau bersih hotel kamar kurang mandi
## 10633 2 0 0 0 0 0 2 2 1 1
## 11094 1 1 0 0 0 0 0 2 1 0
## 11108 0 0 0 0 2 0 0 3 0 1
## 3307 0 1 0 0 0 1 1 4 0 1
## 3936 0 1 0 0 0 0 0 0 0 0
## 460 0 0 0 0 0 1 0 1 0 0
## 5413 1 1 0 0 2 0 0 1 0 0
## 572 0 0 0 0 0 0 0 5 0 1
## 5871 1 0 0 0 0 0 0 2 2 0
## 7135 0 1 0 0 0 0 0 2 0 1
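Since only these 900 frequent terms will be used as predictors, the test DTM can be restricted to the same vocabulary so that train and test share an identical feature space. This is a sketch of that step; the filtered object gets its own name because the write-up below keeps using the full test DTM.
# Sketch: apply the same frequent-term filter to the test set
hotel_test_gen_filter <- hotel_test[, freq]
hotel_test_filter_df <- as.data.frame(as.matrix(hotel_test_gen_filter))
dim(hotel_test_filter_df)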
Reduce Data Imbalance with SMOTE
library(smotefamily)
hotel_train_gen_df <- as.data.frame(as.matrix(hotel_train_gen_filter))
train_smote_gen <- SMOTE(X = hotel_train_gen_df,
                         target = train_labels_gen,
                         dup_size = 5)
train_smote_gen <- train_smote_gen$data # extract only the balanced dataset
train_smote_gen$class <- as.factor(train_smote_gen$class)
prop.table(table(train_smote_gen$class))
##
## 0 1
## 0.5169455 0.4830545
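For the absolute class counts behind these proportions (handy for seeing how many synthetic minority rows SMOTE added), the same tables can be printed without prop.table:
# Class counts after SMOTE versus the original training labels
table(train_smote_gen$class)
table(train_labels_gen)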
hotel_test_df <- as.data.frame(as.matrix(hotel_test))
naive_model <- naiveBayes(class ~ .,
                          data = train_smote_gen,
                          laplace = 1)
general_predict_bayes <- predict(naive_model, hotel_test_df)
confusionMatrix(general_predict_bayes, test_labels_gen)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 296 15
## 1 2103 372
##
## Accuracy : 0.2398
## 95% CI : (0.224, 0.2561)
## No Information Rate : 0.8611
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0259
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.1234
## Specificity : 0.9612
## Pos Pred Value : 0.9518
## Neg Pred Value : 0.1503
## Prevalence : 0.8611
## Detection Rate : 0.1062
## Detection Prevalence : 0.1116
## Balanced Accuracy : 0.5423
##
## 'Positive' Class : 0
##
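The Naive Bayes accuracy above is poor partly because e1071::naiveBayes models numeric predictors as Gaussian, which fits sparse term counts badly. A common alternative for text classification, not used in this write-up, is to convert counts to presence/absence factors before training. The sketch below rests on that assumption; names such as to_binary are introduced here for illustration only.
# Sketch: Bernoulli-style features -- convert term counts to yes/no factors
to_binary <- function(x) factor(ifelse(x > 0, "yes", "no"), levels = c("no", "yes"))
predictors <- setdiff(names(train_smote_gen), "class")
train_bin <- as.data.frame(lapply(train_smote_gen[predictors], to_binary))
train_bin$class <- train_smote_gen$class
test_bin <- as.data.frame(lapply(hotel_test_df, to_binary))
nb_binary <- naiveBayes(class ~ ., data = train_bin, laplace = 1)
# confusionMatrix(predict(nb_binary, test_bin), test_labels_gen)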
library(partykit)
library(randomForest)
# forest_train_gen <- train_smote_gen %>%
#   select(-class)
# forest_label_gen <- unlist(train_smote_gen %>% select(class))
# forest_gen_smote <- randomForest(x = forest_train_gen,
#                                  y = forest_label_gen,
#                                  ntree = 500)
# saveRDS(forest_gen_smote, "forest_general_posneg.rds")
forest_gen_smote <- readRDS("forest_general_posneg.rds")
general_predict <- predict(forest_gen_smote, hotel_test_df)
confusionMatrix(general_predict, test_labels_gen)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 2308 150
## 1 91 237
##
## Accuracy : 0.9135
## 95% CI : (0.9024, 0.9237)
## No Information Rate : 0.8611
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.6137
##
## Mcnemar's Test P-Value : 0.0001869
##
## Sensitivity : 0.9621
## Specificity : 0.6124
## Pos Pred Value : 0.9390
## Neg Pred Value : 0.7226
## Prevalence : 0.8611
## Detection Rate : 0.8284
## Detection Prevalence : 0.8823
## Balanced Accuracy : 0.7872
##
## 'Positive' Class : 0
##
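To see which terms the random forest relies on most, its built-in importance scores can be read from the fitted object; showing the top ten is an arbitrary choice.
# Top terms by mean decrease in Gini impurity
imp <- randomForest::importance(forest_gen_smote)
head(imp[order(imp[, 1], decreasing = TRUE), , drop = FALSE], 10)
varImpPlot(forest_gen_smote, n.var = 10)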
library(e1071)
# svm_model <- svm(class ~ ., data = train_smote_gen, kernel = "linear")
# saveRDS(svm_model, "forest_general_svm.rds")
svm_model <- readRDS("forest_general_svm.rds")
svm_predict <- predict(svm_model, hotel_test_df)
confusionMatrix(svm_predict, test_labels_gen)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 2137 108
## 1 262 279
##
## Accuracy : 0.8672
## 95% CI : (0.854, 0.8796)
## No Information Rate : 0.8611
## P-Value [Acc > NIR] : 0.1833
##
## Kappa : 0.5242
##
## Mcnemar's Test P-Value : 1.805e-15
##
## Sensitivity : 0.8908
## Specificity : 0.7209
## Pos Pred Value : 0.9519
## Neg Pred Value : 0.5157
## Prevalence : 0.8611
## Detection Rate : 0.7670
## Detection Prevalence : 0.8058
## Balanced Accuracy : 0.8059
##
## 'Positive' Class : 0
##
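The linear SVM above uses the default cost parameter. If tuning is worth the compute, e1071's tune.svm can cross-validate over a grid of cost values; the grid below is an assumption, and the call is left commented out, like the original training call, because it is slow.
# Sketch: cross-validated grid search over the SVM cost parameter
# svm_tuned <- tune.svm(class ~ ., data = train_smote_gen,
#                       kernel = "linear", cost = c(0.1, 1, 10))
# summary(svm_tuned)
# best_svm <- svm_tuned$best.model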