1 Input data & Library

library(dplyr)
library(ggplot2)
library(glue)
library(tm)
library(koRpus)
library(katadasaR)
library(tokenizers)
library(plotly)
library(wordcloud2)
library(RColorBrewer)
library(randomForest)

# Load the unseen reviews to score. The file is read without a header row, so
# the columns get the default names V1, V2, ...; the corpus step further down
# reads the review text from V1.
# stringsAsFactors = FALSE keeps the text as character on R < 4.0 as well,
# consistent with how the slang dictionary CSV is read.
text <- read.csv("unseen_datamini.csv", header = FALSE, stringsAsFactors = FALSE)

# Display the loaded data for a quick visual check.
text

2 Preparation

# Report whether the loaded data contains any missing (NA) values.
anyNA(text)

## [1] FALSE

3 Data Cleansing

Functions for cleansing:

 ## Slang
# Load the slang dictionary: a header-less CSV whose first column (V1) holds
# the slang word and whose second column (V2) holds its meaning.
slang_data <- read.csv(
  file = "new_kamusalay.csv",
  header = FALSE,
  stringsAsFactors = FALSE
)

# Named lookup vector: names are the slang words, values are their meanings.
slang_mapping <- slang_data$V2
names(slang_mapping) <- slang_data$V1

# Create a function to replace slang words with their meanings
#' Replace slang words in text with their dictionary meanings
#'
#' Each element of `text` is split on single spaces; every token that exactly
#' matches a name in `mapping` is replaced by the corresponding value, and the
#' tokens are joined back together with single spaces. Tokens with no match
#' are kept as they are.
#'
#' @param text Character vector of texts to normalise. Elements are processed
#'   independently, so a multi-element input is no longer merged into one
#'   string.
#' @param mapping Named character vector whose names are slang words and whose
#'   values are their meanings. Defaults to the global `slang_mapping`.
#' @return An unnamed character vector the same length as `text`.
replace_slang <- function(text, mapping = slang_mapping) {
  slang_words <- names(mapping)
  meanings <- unname(mapping)

  normalise_one <- function(doc) {
    words <- unlist(strsplit(doc, " ", fixed = TRUE))
    # One vectorised lookup instead of scanning the dictionary once per word;
    # match() returns the first hit, like name-based subsetting does.
    hit <- match(words, slang_words)
    found <- !is.na(hit)
    words[found] <- meanings[hit[found]]
    paste(words, collapse = " ")
  }

  vapply(text, normalise_one, character(1), USE.NAMES = FALSE)
}

## Stopword

# Stopword list: every line of the file, lower-cased (the corpus is also
# lower-cased before the stopword-removal step runs).
stopwords_id <- readLines("stopwords_id.txt") %>%
  tolower()

## Warning in readLines("stopwords_id.txt"): incomplete final line found on
## 'stopwords_id.txt'

## Stemming

# tm transformer that stems a document: tokenise its text into words, pass
# every token through katadasaR, and join the results with single spaces.
stem_katadasaR <- content_transformer(function(x) {
  tokens <- unlist(tokenizers::tokenize_words(x))
  stems <- sapply(tokens, katadasaR::katadasaR)
  paste(stems, collapse = " ")
})

# Build the corpus from the review text in column V1.
corpus <- VCorpus(VectorSource(text$V1))

# Cleaning pipeline; the steps run in exactly this order.
corpus_cl <- corpus %>%
  tm_map(removeNumbers) %>%                       # remove numbers
  tm_map(content_transformer(tolower)) %>%        # lower-case
  tm_map(content_transformer(replace_slang)) %>%  # replace slang words
  tm_map(removeWords, stopwords_id) %>%           # remove stopwords
  tm_map(removePunctuation) %>%                   # remove punctuation
  tm_map(stem_katadasaR) %>%                      # stemming
  tm_map(stripWhitespace)                         # collapse whitespace

# Show the cleaned text of the 10th document as a spot check.
corpus_cl[[10]]$content

## [1] "ac dingin kunci kamar pakai habis tap card keluar masuk kamar panggil tugas kamar ngengat bikin gatal"

DTM

# Build the document-term matrix (rows = documents, columns = terms) from the
# cleaned corpus, using tm's default control settings.
# NOTE(review): tm's defaults appear to keep only terms of 3+ characters, which
# would drop tokens such as "ac" or "tv" -- confirm this matches how the
# models were trained.
text_dtm <- DocumentTermMatrix(corpus_cl)

4 General EDA

# Number of rows in `text`, i.e. the number of reviews loaded.
total_review <- nrow(text)
total_review

## [1] 500

# Total occurrences of each term across all documents, sorted in descending
# order. TRUE is spelled out because `T` is an ordinary variable that can be
# reassigned.
text_hotel <- as.matrix(text_dtm)
hotel_list <- sort(colSums(text_hotel), decreasing = TRUE)

# Reshape into a data frame for visualisation; `label` holds the tooltip text
# shown for each term.
hotel_df <- data.frame(word = names(hotel_list), freq = hotel_list)
hotel_df <- hotel_df %>%
  mutate(label = glue("Frekuensi: {freq}"))

# Horizontal bar chart of the first seven rows of hotel_df (the most frequent
# terms, since the frequencies were sorted in descending order); bars are
# ordered and shaded by frequency.
# `text` is not a ggplot2 aesthetic (hence the "unknown aesthetics" warning);
# it is kept so ggplotly(tooltip = "text") can use it as the hover label.
plot <- ggplot(head(hotel_df, 7), aes(y = reorder(word, freq), x = freq)) +
  geom_col(aes(fill = freq, text = label), show.legend = FALSE) +
  labs(x = "Frekuensi",
       y = "Terms/Kata",
       title = "Frekuensi Kata Tertinggi") +
  #scale_x_continuous(labels = "kata") +
  scale_fill_gradient(low = "#85c946", high = "#304919") +
  theme_minimal() +
  theme(axis.text.y = element_text(face = "bold", size = 11))

## Warning in geom_col(aes(fill = freq, text = label), show.legend = F): Ignoring
## unknown aesthetics: text

# Interactive version of the bar chart; the tooltip shows the `text` aesthetic.
ggplotly(plot, tooltip = "text")

# NOTE(review): this palette is not passed to the wordcloud2() call below --
# confirm whether it is used elsewhere or can be dropped.
colors.wc <- brewer.pal(8, "Dark2")

# Word cloud built from the term-frequency data frame.
wordcloud2(hotel_df, size = 1.5)

5 Predict

# Load the serialized model objects from disk, one .rds file per model.
# NOTE(review): the file names suggest one general positive/negative model plus
# one model per aspect -- confirm what each model actually predicts.
model_general <- readRDS("forest_general_posneg.rds")
model_ac <- readRDS("forest_ac.rds")
model_ap <- readRDS("forest_ap.rds")
model_bau <- readRDS("forest_bau.rds")
model_bersih <- readRDS("forest_bersih.rds")
model_linen <- readRDS("forest_linen.rds")
model_service <- readRDS("forest_service.rds")
model_sm <- readRDS("forest_sm.rds")
model_tv <- readRDS("forest_tv.rds")
model_wifi <- readRDS("forest_wifi.rds")
# NOTE(review): loaded from a "forest_"-prefixed file but named *_svm --
# confirm the model type.
general_svm <- readRDS("forest_general_svm.rds")

# Dense data-frame copy of the document-term matrix; it is the data argument of
# the commented-out predict() calls further down.
# NOTE(review): predict() presumably needs the same term columns the models
# were trained on -- confirm the columns are aligned before enabling it.
predict_df <- as.data.frame(as.matrix(text_dtm))
# Print a summary and a sample of the document-term matrix.
inspect(text_dtm)

## <<DocumentTermMatrix (documents: 500, terms: 1314)>>
## Non-/sparse entries: 5288/651712
## Sparsity           : 99%
## Maximal term length: 19
## Weighting          : term frequency (tf)
## Sample             :
##      Terms
## Docs  air ayan baik bau bersih dingin kamar kurang mandi panas
##   139   2    0    0   0      1      2     1      2     1     1
##   223   2    0    0   0      1      0     2      1     1     1
##   241   1    0    0   0      1      1     2      0     0     0
##   275   1    1    0   1      0      0     0      1     0     0
##   337   0    0    0   0      1      0     1      0     0     0
##   360   1    0    2   0      1      0     2      0     0     0
##   461   0    0    0   1      0      0     3      0     1     0
##   482   1    0    0   0      0      0     1      1     1     1
##   484   1    0    0   0      0      0     3      0     0     2
##   488   0    0    0   0      0      0     1      0     0     0

# general <- predict(model_general,predict_df,)
# AC <- predict(model_ac,predict_df)
# Airpanas <- predict(model_ap,predict_df)
# Bau <- predict(model_bau, predict_df)
# Kebersihan <- predict(model_bersih,predict_df)
# Linen <- predict(model_linen,predict_df)
# Service <- predict(model_service,predict_df)
# Sarapan <- predict(model_sm,predict_df)
# TV <- predict(model_tv,predict_df)
# wifi <- predict(model_wifi,predict_df)

# out <- data.frame(text,general,Kebersihan,Linen, row.names = NULL)
# out

Coba Predict - Hotel Sentiment Analysis

Rizky Fadilah

September 2023

1 Input data & Library

2 Preparation

3 Data Cleansing

4 General EDA

5 Predict