library(dplyr)
library(ggplot2)
library(glue)
library(tm)
library(koRpus)
library(katadasaR)
library(tokenizers)
library(plotly)
library(wordcloud2)
library(RColorBrewer)
library(randomForest)

text <- read.csv("unseen_datamini.csv", header = FALSE)
text
anyNA(text)
## [1] FALSE
Functions for text cleansing:
## Slang
# Read the CSV file containing the slang word mappings
slang_data <- read.csv("new_kamusalay.csv", header = FALSE, stringsAsFactors = FALSE)
# Create a mapping from slang words to their meanings
slang_mapping <- setNames(slang_data$V2, slang_data$V1)
# Create a function to replace slang words with their meanings
replace_slang <- function(text) {
  words <- unlist(strsplit(text, " "))  # split text into words
  replaced_words <- sapply(words, function(word) {
    if (word %in% names(slang_mapping)) {
      return(slang_mapping[word])
    } else {
      return(word)
    }
  })
  return(paste(replaced_words, collapse = " "))
}
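As a quick sanity check, replace_slang() can be applied to a single string. The slang key used here ("bgt") is an illustrative assumption; the actual replacements depend on what new_kamusalay.csv contains.

# Illustrative only: assumes "bgt" is a key in slang_mapping
replace_slang("kamar bersih bgt")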
## Stopword
stopwords_id <- tolower(readLines("stopwords_id.txt"))
## Warning in readLines("stopwords_id.txt"): incomplete final line found on
## 'stopwords_id.txt'
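The warning only indicates that the stopword file lacks a trailing newline; the read itself is unaffected. If it is distracting, readLines() accepts warn = FALSE as a minor alternative.

# Same read, with the "incomplete final line" warning suppressed
stopwords_id <- tolower(readLines("stopwords_id.txt", warn = FALSE))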
## Stemming
stem_katadasaR <- content_transformer(function(x) {
  paste(sapply(unlist(tokenizers::tokenize_words(x)), katadasaR::katadasaR), collapse = ' ')
})

corpus <- VCorpus(VectorSource(text$V1))
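The katadasaR() stemmer can also be called on a single word to see its behavior in isolation. The root shown in the comment is an assumption based on the Nazief-Adriani algorithm the package implements, not recorded output from this run.

# Illustrative single-word check of the Indonesian stemmer
katadasaR::katadasaR("membersihkan")  # expected root: "bersih" (assumed)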
corpus_cl <- tm_map(x = corpus, FUN = removeNumbers)                          # remove numbers
corpus_cl <- tm_map(x = corpus_cl, FUN = content_transformer(tolower))        # lowercase
corpus_cl <- tm_map(x = corpus_cl, FUN = content_transformer(replace_slang))  # normalize slang
corpus_cl <- tm_map(x = corpus_cl, FUN = removeWords, stopwords_id)           # remove stopwords
corpus_cl <- tm_map(x = corpus_cl, FUN = removePunctuation)                   # remove punctuation
corpus_cl <- tm_map(corpus_cl, stem_katadasaR)                                # stemming
corpus_cl <- tm_map(x = corpus_cl, FUN = stripWhitespace)                     # strip extra whitespace

corpus_cl[[10]]$content
## [1] "ac dingin kunci kamar pakai habis tap card keluar masuk kamar panggil tugas kamar ngengat bikin gatal"
DTM (Document-Term Matrix)
text_dtm <- DocumentTermMatrix(corpus_cl)

total_review <- nrow(text)
total_review
## [1] 500
# Sort term frequencies in descending order
text_hotel <- as.matrix(text_dtm)
hotel_list <- sort(colSums(text_hotel), decreasing = TRUE)

# Put the counts into a data frame for visualization
hotel_df <- data.frame(word = names(hotel_list), freq = hotel_list)
hotel_df <- hotel_df %>%
  mutate(label = glue("Frekuensi: {freq}"))

plot <- ggplot(head(hotel_df, 7), aes(y = reorder(word, freq), x = freq)) +
  geom_col(aes(fill = freq, text = label), show.legend = F) +
  labs(x = "Frekuensi",
       y = "Terms/Kata",
       title = "Frekuensi Kata Tertinggi") +
  # scale_x_continuous(labels = "kata") +
  scale_fill_gradient(low = "#85c946", high = "#304919") +
  theme_minimal() +
  theme(axis.text.y = element_text(face = "bold", size = 11))
## Warning in geom_col(aes(fill = freq, text = label), show.legend = F): Ignoring
## unknown aesthetics: text
ggplotly(plot, tooltip = "text")

colors.wc <- brewer.pal(8, "Dark2")
wordcloud2(hotel_df, size = 1.5)

model_general <- readRDS("forest_general_posneg.rds")
model_ac <- readRDS("forest_ac.rds")
model_ap <- readRDS("forest_ap.rds")
model_bau <- readRDS("forest_bau.rds")
model_bersih <- readRDS("forest_bersih.rds")
model_linen <- readRDS("forest_linen.rds")
model_service <- readRDS("forest_service.rds")
model_sm <- readRDS("forest_sm.rds")
model_tv <- readRDS("forest_tv.rds")
model_wifi <- readRDS("forest_wifi.rds")
general_svm <- readRDS("forest_general_svm.rds")

predict_df <- as.data.frame(as.matrix(text_dtm))
inspect(text_dtm)
## <<DocumentTermMatrix (documents: 500, terms: 1314)>>
## Non-/sparse entries: 5288/651712
## Sparsity : 99%
## Maximal term length: 19
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs air ayan baik bau bersih dingin kamar kurang mandi panas
## 139 2 0 0 0 1 2 1 2 1 1
## 223 2 0 0 0 1 0 2 1 1 1
## 241 1 0 0 0 1 1 2 0 0 0
## 275 1 1 0 1 0 0 0 1 0 0
## 337 0 0 0 0 1 0 1 0 0 0
## 360 1 0 2 0 1 0 2 0 0 0
## 461 0 0 0 1 0 0 3 0 1 0
## 482 1 0 0 0 0 0 1 1 1 1
## 484 1 0 0 0 0 0 3 0 0 2
## 488 0 0 0 0 0 0 1 0 0 0
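The randomForest models expect the vocabulary they were trained on, while predict_df only has the 1314 terms found in these 500 reviews. One way to reconcile the two, sketched below with a hypothetical align_features() helper (not part of the original pipeline), is to add the missing training terms as zero columns and keep only the training vocabulary, in training order.

# Hypothetical helper: align the unseen DTM to a model's training vocabulary.
# For a randomForest fit, rownames(model$importance) gives the predictor names.
align_features <- function(df, model) {
  train_terms <- rownames(model$importance)
  missing <- setdiff(train_terms, colnames(df))
  if (length(missing) > 0) df[missing] <- 0   # training terms absent from these reviews
  df[, train_terms, drop = FALSE]             # training order; drops terms the model never saw
}
# Assumed usage: general <- predict(model_general, align_features(predict_df, model_general))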
# general <- predict(model_general, predict_df)
# AC <- predict(model_ac, predict_df)
# Airpanas <- predict(model_ap, predict_df)
# Bau <- predict(model_bau, predict_df)
# Kebersihan <- predict(model_bersih, predict_df)
# Linen <- predict(model_linen, predict_df)
# Service <- predict(model_service, predict_df)
# Sarapan <- predict(model_sm, predict_df)
# TV <- predict(model_tv, predict_df)
# wifi <- predict(model_wifi, predict_df)

# out <- data.frame(text, general, Kebersihan, Linen, row.names = NULL)
# out