1 Model general : pos-neg

Digunakan data sekunder yang berasal dari review hotel dan hanya memiliki label positif dan negatif.

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tm)
## Loading required package: NLP
library(SnowballC)
library(e1071)
library(caret)
## Loading required package: ggplot2
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate
## Loading required package: lattice
library(purrr)
## 
## Attaching package: 'purrr'
## The following object is masked from 'package:caret':
## 
##     lift
# Load the labelled hotel-review data set from the working directory and
# display it.
hotel <- read.csv(file = "datasekunder_general.csv")
hotel

2 Preparation Data

# Drop rows containing missing values, then store the `labels` column as a
# factor.
hotel <- hotel %>%
  na.omit() %>%
  mutate(labels = as.factor(labels))

3 Data Cleansing

# Build a volatile corpus with one document per element of hotel$text, then
# show the raw content of document 10 as a spot check.
corpus <- hotel$text %>%
  VectorSource() %>%
  VCorpus()
corpus[[10]][["content"]]
## [1] "Tempatnya nyaman, pelayanan ramah banget. Recommended banget tempatnya."

Menghilangkan Number

# Strip digits from every document.
corpus_cl <- tm_map(corpus, removeNumbers)

corpus_cl[[10]][["content"]]
## [1] "Tempatnya nyaman, pelayanan ramah banget. Recommended banget tempatnya."
# Lower-case all text; tolower() is a plain function, so it is wrapped with
# content_transformer() before being handed to tm_map().
corpus_cl <- tm_map(corpus_cl, content_transformer(tolower))

Mengubah slang word menjadi kata baku

# Load the slang dictionary. The file is read without a header row, so the
# two columns get the default names V1 and V2.
slang_data <- read.csv(
  file = "new_kamusalay.csv",
  header = FALSE,
  stringsAsFactors = FALSE
)

# Named lookup vector: names are the slang words (V1), values are their
# meanings (V2).
slang_mapping <- slang_data$V2
names(slang_mapping) <- slang_data$V1

# Create a function to replace slang words with their meanings
# Replace slang words in a text with their dictionary meanings.
#
# Args:
#   text:    Character string; it is split on single spaces into tokens.
#   mapping: Named vector whose names are slang tokens and whose values are
#            the replacements. Defaults to the global `slang_mapping`, so
#            existing one-argument calls keep working.
#
# Returns:
#   One string with the tokens re-joined by single spaces. Tokens that have
#   no entry in `mapping` are left unchanged.
replace_slang <- function(text, mapping = slang_mapping) {
  words <- unlist(strsplit(text, " ", fixed = TRUE))

  # One vectorised lookup instead of a dictionary scan per word. match()
  # returns the first hit, which is also what name-based indexing would pick
  # when a key is duplicated.
  hit <- match(words, names(mapping))

  # Empty tokens (produced by consecutive spaces) are never replaced: an
  # empty dictionary key would otherwise turn them into NA, because
  # indexing a vector by the name "" never matches in R.
  found <- !is.na(hit) & nzchar(words)
  words[found] <- mapping[hit[found]]

  paste(words, collapse = " ")
}
# Normalise slang in every document, then re-check document 10.
corpus_cl <- tm_map(corpus_cl, content_transformer(replace_slang))

corpus_cl[[10]][["content"]]
## [1] "tempatnya nyaman, pelayanan ramah banget. recommended banget tempatnya."

Remove Stopword

# Read the stop-word file line by line and lower-case it so it matches the
# already lower-cased corpus.
stopwords_id <- tolower(readLines("stopwords_id.txt"))
## Warning in readLines("stopwords_id.txt"): incomplete final line found on
## 'stopwords_id.txt'
corpus_cl[[9]][["content"]]
## [1] "banyak kecoa nya"
# Remove every stop word from every document.
corpus_cl <- tm_map(corpus_cl, removeWords, words = stopwords_id)

corpus_cl[[10]][["content"]]
## [1] "tempatnya nyaman, pelayanan ramah banget. recommended banget tempatnya."

Remove Punctuation

# Strip punctuation from every document, then re-check document 10.
corpus_cl <- tm_map(corpus_cl, removePunctuation)
corpus_cl[[10]][["content"]]
## [1] "tempatnya nyaman pelayanan ramah banget recommended banget tempatnya"

Stemming

library(koRpus)
## Loading required package: sylly
## For information on available language packages for 'koRpus', run
## 
##   available.koRpus.lang()
## 
## and see ?install.koRpus.lang()
## 
## Attaching package: 'koRpus'
## The following object is masked from 'package:tm':
## 
##     readTagged
library(tm)
library(katadasaR)
library(tokenizers)
## Warning: package 'tokenizers' was built under R version 4.3.1
library(katadasaR)
# Stemming transformer for tm_map(): split the text into word tokens, stem
# each token with katadasaR, and join the stems with single spaces.
# NOTE(review): in the recorded output for document 10, "pelayanan" comes out
# as "ayan" - confirm this level of stemming is acceptable.
stem_katadasaR <- content_transformer(function(x) {
  tokens <- unlist(tokenizers::tokenize_words(x))
  stems <- sapply(tokens, katadasaR::katadasaR)
  paste(stems, collapse = " ")
})

corpus_cl <- tm_map(x = corpus_cl, FUN = stem_katadasaR)
corpus_cl[[10]][["content"]]
## [1] "tempat nyaman ayan ramah banget recommended banget tempat"

Remove Whitespaces

# Collapse extra whitespace in every document, then re-check document 10.
corpus_cl <- tm_map(corpus_cl, stripWhitespace)

corpus_cl[[10]][["content"]]
## [1] "tempat nyaman ayan ramah banget recommended banget tempat"

Dokumen Term Matriks

# Turn the cleaned corpus into a document-term matrix.
hotel_dtm <- corpus_cl %>%
  DocumentTermMatrix()

# Quick look at the structure of the DTM
hotel_dtm %>%
  inspect()
## <<DocumentTermMatrix (documents: 11142, terms: 9139)>>
## Non-/sparse entries: 105867/101720871
## Sparsity           : 100%
## Maximal term length: 49
## Weighting          : term frequency (tf)
## Sample             :
##        Terms
## Docs    air airy ayan baik bersih hotel kamar kotor kurang mandi
##   11094   1    1    0    0      0     0     2     0      1     0
##   1258    0    0    0    0      0     0     0     0      0     0
##   2321    0    0    0    0      0     1     0     0      0     0
##   2951    0    0    0    0      0     0     0     0      0     0
##   3231    0    0    0    0      0     2     0     0      0     0
##   3936    0    1    0    0      0     0     0     0      0     0
##   4867    0    1    0    0      0     0     2     0      0     1
##   5324    0    2    0    0      0     1     0     0      0     0
##   6455    0    3    0    0      0     4     0     0      0     0
##   6713    0    0    0    0      0     0     0     0      0     0

4 EDA

library(glue)
library(ggplot2)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
# Total frequency of each term across all documents, sorted in descending
# order.
# NOTE(review): as.matrix() turns the whole sparse DTM into a dense matrix;
# check memory use if the corpus grows.
text_hotel <- as.matrix(hotel_dtm)
hotel_list <- sort(colSums(text_hotel), decreasing = TRUE)

# Data frame used by the visualisations: one row per term with its frequency
# and a tooltip label.
hotel_df <- data.frame(word = names(hotel_list), freq = hotel_list) %>%
  mutate(label = glue("Frekuensi: {freq}"))

4.1 Wordcloud

library(wordcloud2)
library(RColorBrewer)
# Eight-colour "Dark2" palette.
# NOTE(review): colors.wc is not passed to wordcloud2() below - confirm
# whether it is used elsewhere.
colors.wc <- brewer.pal(n = 8, name = "Dark2")

# Word cloud of all terms in hotel_df.
wordcloud2(data = hotel_df, size = 1.5)

4.2 Frekuensi Kata Tertinggi

# Horizontal bar chart of the 7 most frequent terms (hotel_df is already
# sorted by frequency). The `text` aesthetic is not a ggplot2 aesthetic; it
# is carried along so ggplotly() can use it as the tooltip.
plot <- ggplot(head(hotel_df, 7), aes(y = reorder(word, freq), x = freq)) +
  geom_col(aes(fill = freq, text = label), show.legend = FALSE) +
  labs(x = "Frekuensi",
       y = "Terms/Kata",
       title = "Frekuensi Kata Tertinggi") +
  #scale_x_continuous(labels = "kata") +
  scale_fill_gradient(low = "#85c946", high = "#304919") +
  theme_minimal() +
  theme(axis.text.y = element_text(face = "bold", size = 11))
## Warning in geom_col(aes(fill = freq, text = label), show.legend = F): Ignoring
## unknown aesthetics: text
# Render the chart interactively, showing only the `text` aesthetic on hover.
ggplotly(p = plot, tooltip = "text")

5 Cross Validation

# Select the "Rounding" sampler and fix the seed so the split below is
# reproducible.
RNGkind(sample.kind = "Rounding")
## Warning in RNGkind(sample.kind = "Rounding"): non-uniform 'Rounding' sampler
## used
set.seed(seed = 100)

# Train-test split: draw 75% of the document rows for training.
index <- sample(x = nrow(hotel_dtm), size = nrow(hotel_dtm) * 0.75)

# Sampled rows form the training DTM, the remaining rows the test DTM.
hotel_train <- hotel_dtm[index, ]
hotel_test <- hotel_dtm[-index, ]

6 Model Fitting

Akan dilakukan pembuatan model untuk setiap variabel target

6.1 Model Variabel General

Memilih target variable (general)

# Target labels for the train and test rows, using the same `index` as the
# DTM split so labels and documents stay aligned. The column is selected by
# name (the factor created in the preparation step) rather than by position,
# so a reordered CSV cannot silently pick another column.
train_labels_gen <- hotel$labels[index]
test_labels_gen <- hotel$labels[-index]

Cek label

Cek distribusi label (0 dan 1) pada data train dan data test

# Class proportions in the training labels.
proportions(table(train_labels_gen))
## train_labels_gen
##         0         1 
## 0.8652465 0.1347535
# Class proportions in the test labels.
proportions(table(test_labels_gen))
## test_labels_gen
##         0         1 
## 0.8610912 0.1389088

Data Preparation lanjutan

Menghilangkan kata jarang muncul

# Number of documents and terms in the training DTM.
hotel_train %>%
  dim() %>%
  print()
## [1] 8356 9139

Prediktor memiliki 9139 kata; dilakukan penghapusan kata yang jarang muncul (frekuensi di bawah 10)

# Terms that occur at least 10 times in the training DTM.
freq <- findFreqTerms(x = hotel_train, lowfreq = 10)

length(freq)
## [1] 900
# Keep only those frequent terms as predictors and inspect the result.
hotel_train_gen_filter <- hotel_train[, freq]
hotel_train_gen_filter %>%
  inspect()
## <<DocumentTermMatrix (documents: 8356, terms: 900)>>
## Non-/sparse entries: 67536/7452864
## Sparsity           : 99%
## Maximal term length: 13
## Weighting          : term frequency (tf)
## Sample             :
##        Terms
## Docs    air airy ayan baik bau bersih hotel kamar kurang mandi
##   10633   2    0    0    0   0      0     2     2      1     1
##   11094   1    1    0    0   0      0     0     2      1     0
##   11108   0    0    0    0   2      0     0     3      0     1
##   3307    0    1    0    0   0      1     1     4      0     1
##   3936    0    1    0    0   0      0     0     0      0     0
##   460     0    0    0    0   0      1     0     1      0     0
##   5413    1    1    0    0   2      0     0     1      0     0
##   572     0    0    0    0   0      0     0     5      0     1
##   5871    1    0    0    0   0      0     0     2      2     0
##   7135    0    1    0    0   0      0     0     2      0     1

Mengurangi data imbalance dengan SMOTE

library(smotefamily)
## Warning: package 'smotefamily' was built under R version 4.3.1
# Convert the filtered training DTM to a dense data frame for SMOTE().
hotel_train_gen_df <- hotel_train_gen_filter %>%
  as.matrix() %>%
  as.data.frame()

# Oversample with SMOTE and keep only the `data` element of the result,
# i.e. the balanced data set.
train_smote_gen <- SMOTE(
  X = hotel_train_gen_df,
  target = train_labels_gen,
  dup_size = 5
)$data
train_smote_gen$class <- as.factor(train_smote_gen$class)

# Class proportions after balancing.
prop.table(table(train_smote_gen$class))
## 
##         0         1 
## 0.5169455 0.4830545
# Dense data frame of the test DTM; it keeps every term of the DTM, not only
# the filtered training terms.
hotel_test_df <- hotel_test %>%
  as.matrix() %>%
  as.data.frame()

# Naive Bayes on the SMOTE-balanced training data.
# NOTE(review): the predictors are numeric counts; e1071's naiveBayes
# presumably models numeric predictors as Gaussian and applies `laplace` only
# to categorical ones - confirm, as this may explain the low accuracy below.
naive_model <- naiveBayes(
  class ~ .,
  data = train_smote_gen,
  laplace = 1
)
general_predict_bayes <- predict(object = naive_model, newdata = hotel_test_df)
confusionMatrix(data = general_predict_bayes, reference = test_labels_gen)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0  296   15
##          1 2103  372
##                                          
##                Accuracy : 0.2398         
##                  95% CI : (0.224, 0.2561)
##     No Information Rate : 0.8611         
##     P-Value [Acc > NIR] : 1              
##                                          
##                   Kappa : 0.0259         
##                                          
##  Mcnemar's Test P-Value : <2e-16         
##                                          
##             Sensitivity : 0.1234         
##             Specificity : 0.9612         
##          Pos Pred Value : 0.9518         
##          Neg Pred Value : 0.1503         
##              Prevalence : 0.8611         
##          Detection Rate : 0.1062         
##    Detection Prevalence : 0.1116         
##       Balanced Accuracy : 0.5423         
##                                          
##        'Positive' Class : 0              
## 
library(partykit)
## Warning: package 'partykit' was built under R version 4.3.1
## Loading required package: grid
## Loading required package: libcoin
## Warning: package 'libcoin' was built under R version 4.3.1
## Loading required package: mvtnorm
## Warning: package 'mvtnorm' was built under R version 4.3.1
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.3.1
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
## The following object is masked from 'package:dplyr':
## 
##     combine
# Random forest. The training code is left commented out; the fitted model is
# loaded from an RDS file instead (presumably to avoid refitting on every
# run - confirm the file matches the current feature set).
# forest_train_gen <- train_smote_gen %>%
#   select(-class)
# forest_label_gen <- unlist(train_smote_gen %>% select(class))
# forest_gen_smote <- randomForest( x = forest_train_gen,
#                                   y = forest_label_gen,
#                                   ntree = 500)
#saveRDS(forest_gen_smote, "forest_general_posneg.rds")
forest_gen_smote <- readRDS(file = "forest_general_posneg.rds")

# Predict on the test set and compare against the true labels.
general_predict <- predict(object = forest_gen_smote, newdata = hotel_test_df)
confusionMatrix(data = general_predict, reference = test_labels_gen)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 2308  150
##          1   91  237
##                                           
##                Accuracy : 0.9135          
##                  95% CI : (0.9024, 0.9237)
##     No Information Rate : 0.8611          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.6137          
##                                           
##  Mcnemar's Test P-Value : 0.0001869       
##                                           
##             Sensitivity : 0.9621          
##             Specificity : 0.6124          
##          Pos Pred Value : 0.9390          
##          Neg Pred Value : 0.7226          
##              Prevalence : 0.8611          
##          Detection Rate : 0.8284          
##    Detection Prevalence : 0.8823          
##       Balanced Accuracy : 0.7872          
##                                           
##        'Positive' Class : 0               
## 
library(e1071)
# Linear-kernel SVM. The training code is left commented out; the fitted
# model is loaded from an RDS file instead.
#svm_model <- svm(class ~ ., data = train_smote_gen, kernel = "linear")
#saveRDS(svm_model, "forest_general_svm.rds")
svm_model <- readRDS(file = "forest_general_svm.rds")

# Predict on the test set and compare against the true labels.
svm_predict <- predict(object = svm_model, newdata = hotel_test_df)
confusionMatrix(data = svm_predict, reference = test_labels_gen)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 2137  108
##          1  262  279
##                                          
##                Accuracy : 0.8672         
##                  95% CI : (0.854, 0.8796)
##     No Information Rate : 0.8611         
##     P-Value [Acc > NIR] : 0.1833         
##                                          
##                   Kappa : 0.5242         
##                                          
##  Mcnemar's Test P-Value : 1.805e-15      
##                                          
##             Sensitivity : 0.8908         
##             Specificity : 0.7209         
##          Pos Pred Value : 0.9519         
##          Neg Pred Value : 0.5157         
##              Prevalence : 0.8611         
##          Detection Rate : 0.7670         
##    Detection Prevalence : 0.8058         
##       Balanced Accuracy : 0.8059         
##                                          
##        'Positive' Class : 0              
##