library(dplyr)
library(tidytext)
library(textclean)
library(tm)
library(SnowballC)
library(stringr)
library(rsample)
library(caret)
library(e1071)
library(tidymodels)
library(randomForest)

data_movie <- read.csv("dataset_movie_review.csv")
head(data_movie)

colSums(is.na(data_movie))
##    review sentiment 
##         0         0
Split the data into training and test sets; I use 80% of the data for training and 20% for testing.
RNGkind(sample.kind = "Rounding")
## Warning in RNGkind(sample.kind = "Rounding"): non-uniform 'Rounding' sampler
## used
set.seed(406)
index <- initial_split(data = data_movie, prop = 0.8, strata = "sentiment")
train <- training(index)
test <- testing(index)

data_movie_train <- train %>% # add a cleaned copy of the review column
mutate(review_clean = review %>%
str_to_lower() %>% # convert all characters to lowercase
replace_contraction() %>% # replace contraction to their multi-word forms
replace_internet_slang() %>% # replace internet slang to normal words
replace_word_elongation() %>% # replace informal writing with known semantic replacements
replace_number(remove = T) %>% # remove number
replace_date(replacement = "") %>% # remove date
str_remove_all(pattern = "[[:punct:]]") %>% # remove punctuation
str_remove_all(pattern = "©") %>%
replace_html(symbol = T) %>% # remove html tags
str_squish() %>% # reduces repeated whitespace inside a string.
str_trim() # removes whitespace from start and end of string
)

data_movie_test <- test %>% # add a cleaned copy of the review column
mutate(review_clean = review %>%
str_to_lower() %>% # convert all characters to lowercase
replace_contraction() %>% # replace contraction to their multi-word forms
replace_internet_slang() %>% # replace internet slang to normal words
replace_word_elongation() %>% # replace informal writing with known semantic replacements
replace_number(remove = T) %>% # remove number
replace_date(replacement = "") %>% # remove date
str_remove_all(pattern = "[[:punct:]]") %>% # remove punctuation
str_remove_all(pattern = "©") %>%
replace_html(symbol = T) %>% # remove html tags
str_squish() %>% # reduces repeated whitespace inside a string.
str_trim() # removes whitespace from start and end of string
)

# saveRDS(data_movie_train, "data_movie_train.rds")
data_movie_train <- readRDS("data_movie_train.rds")
head(data_movie_train)

# saveRDS(data_movie_test, "ddata_movie_test.rds")
data_movie_test <- readRDS("ddata_movie_test.rds")
head(data_movie_test)

prop.table(table(data_movie_train$sentiment))
##
## negative positive
## 0.49985 0.50015
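The classes are almost perfectly balanced, so accuracy will be a meaningful metric later on. As a quick sanity check of the cleaning chain above, here is a minimal sketch applying a subset of the same textclean/stringr steps to a made-up review (the example string is hypothetical, not from the dataset):

example_review <- "I didn't like it...   Sooo BORING!!!"
example_review %>%
str_to_lower() %>%
replace_contraction() %>% # "didn't" -> "did not"
replace_word_elongation() %>% # "sooo" -> "so"
str_remove_all(pattern = "[[:punct:]]") %>%
str_squish() %>%
str_trim()
# expected result: roughly "i did not like it so boring"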
Next, I tokenize the training data, remove stopwords, and apply TF-IDF weighting.
corp_train <- VCorpus(VectorSource(data_movie_train$review_clean))
corp_dtm_awal <- corp_train %>%
# use pre-build english stopwords
tm_map(removeWords, stopwords("en")) %>%
# convert corpus to document term matrix
DocumentTermMatrix(control = list(weighting = weightTfIdf))
corp_dtm_awal_clean <- removeSparseTerms(corp_dtm_awal, 0.85)

inspect(corp_dtm_awal_clean)
## <<DocumentTermMatrix (documents: 40000, terms: 50)>>
## Non-/sparse entries: 516011/1483989
## Sparsity : 74%
## Maximal term length: 10
## Weighting : term frequency - inverse document frequency (normalized) (tf-idf)
## Sample :
## Terms
## Docs bad film good great just like
## 171 0.3742526 0.00000000 0.00000000 0.0000000 0.00000000 0.00000000
## 19483 0.0000000 0.00000000 0.05354301 0.0000000 0.02372109 0.02118797
## 21781 0.1060382 0.00000000 0.07094449 0.0000000 0.00000000 0.00000000
## 23575 0.0000000 0.00000000 0.05675559 0.0000000 0.05028871 0.00000000
## 24211 0.1631358 0.00000000 0.00000000 0.3081811 0.00000000 0.00000000
## 26833 0.0963984 0.03980928 0.00000000 0.0000000 0.05714626 0.00000000
## 2728 0.6059328 0.00000000 0.10134927 0.0000000 0.00000000 0.00000000
## 28138 0.0000000 0.00000000 0.00000000 0.0000000 0.05238407 0.04679011
## 31708 0.0000000 0.00000000 0.00000000 0.0000000 0.00000000 0.14037032
## 503 0.0000000 0.00000000 0.00000000 0.2003177 0.12572177 0.00000000
## Terms
## Docs movie one really story
## 171 0.04317077 0.00000000 0.10248619 0.20889285
## 19483 0.09693060 0.01594759 0.09861878 0.00000000
## 21781 0.07339031 0.00000000 0.00000000 0.00000000
## 23575 0.08806837 0.06761779 0.00000000 0.00000000
## 24211 0.00000000 0.00000000 0.00000000 0.27316757
## 26833 0.00000000 0.03841920 0.07919387 0.00000000
## 2728 0.05242165 0.00000000 0.00000000 0.12682780
## 28138 0.06115859 0.00000000 0.00000000 0.07398288
## 31708 0.09173789 0.00000000 0.00000000 0.00000000
## 503 0.00000000 0.00000000 0.00000000 0.00000000
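For reference, the reported 74% sparsity is simply the share of zero cells in the 40,000 × 50 matrix, which can be verified from the non-/sparse entry counts above:

# 516011 non-zero + 1483989 zero cells = 40000 documents x 50 terms
1483989 / (516011 + 1483989)
## [1] 0.7419945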
head(corp_dtm_awal_clean)
## <<DocumentTermMatrix (documents: 6, terms: 50)>>
## Non-/sparse entries: 60/240
## Sparsity : 80%
## Maximal term length: 10
## Weighting : term frequency - inverse document frequency (normalized) (tf-idf)
# convert train dtm to dataframe
data_movie.clean <- as.data.frame(as.matrix(corp_dtm_awal_clean), stringsAsFactors = F)

colSums(is.na(data_movie.clean))
## acting also bad best better can character
## 0 0 0 0 0 0 0
## characters end even ever film films first
## 0 0 0 0 0 0 0
## get good great just know life like
## 0 0 0 0 0 0 0
## little love made make many movie movies
## 0 0 0 0 0 0 0
## much never one people plot really say
## 0 0 0 0 0 0 0
## scene scenes see seen something still story
## 0 0 0 0 0 0 0
## think time two watch watching way well
## 0 0 0 0 0 0 0
## will
## 0
corp_test <- VCorpus(VectorSource(data_movie_test$review_clean))
corp_dtm_awal_test <- corp_test %>%
# use pre-build english stopwords
tm_map(removeWords, stopwords("en")) %>%
# convert corpus to document term matrix
DocumentTermMatrix(control = list(weighting = weightTfIdf,
dictionary = names(data_movie.clean)))
## Warning in weighting(x): empty document(s): 5267 5751 6737 8315 8572 9439
inspect(corp_dtm_awal_test)
## <<DocumentTermMatrix (documents: 9998, terms: 50)>>
## Non-/sparse entries: 128453/371447
## Sparsity : 74%
## Maximal term length: 10
## Weighting : term frequency - inverse document frequency (normalized) (tf-idf)
## Sample :
## Terms
## Docs can film good great just like movie one story will
## 4249 0 0 0 0 0 0.05994459 0 0 0 0
## 4808 0 0 0 0 0 0.00000000 0 0 0 0
## 489 0 0 0 0 0 0.00000000 0 0 0 0
## 6042 0 0 0 0 0 0.00000000 0 0 0 0
## 695 0 0 0 0 0 0.00000000 0 0 0 0
## 7342 0 0 0 0 0 0.00000000 0 0 0 0
## 7645 0 0 0 0 0 0.00000000 0 0 0 0
## 8359 0 0 0 0 0 0.00000000 0 0 0 0
## 8457 0 0 0 0 0 0.00000000 0 0 0 0
## 9139 0 0 0 0 0 0.00000000 0 0 0 0
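The warning above means a few test reviews contain none of the 50 terms kept from the training dictionary, so their rows in the matrix are all zero. A quick way to confirm which documents are affected (a sketch, not part of the original workflow):

empty_docs <- which(rowSums(as.matrix(corp_dtm_awal_test)) == 0)
empty_docs # should list the documents flagged by the warning, e.g. 5267, 5751, 6737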
data_movie.clean_test <- as.data.frame(as.matrix(corp_dtm_awal_test), stringsAsFactors = F)

data_movie.clean$y_label <- as.factor(data_movie_train$sentiment)
data_movie.clean_test$y_label <- as.factor(data_movie_test$sentiment)

train_rf <- data_movie.clean
test_rf <- data_movie.clean_test
colnames(train_rf) <- make.names(colnames(train_rf))
colnames(test_rf) <- make.names(colnames(test_rf))
head(train_rf)

model.rf <- rand_forest(trees = 500, mtry = 4, mode = "classification") %>%
  set_engine("ranger") %>%
  fit(y_label ~ ., data = train_rf)
pred.rf <- predict(model.rf, test_rf,
type = "class")
pred.rf.x <- as.data.frame(cbind(pred.rf, test_rf$y_label)) %>%
setNames(c("pred","actual"))
pred.rf.x

confusionMatrix(data = pred.rf.x$pred,
                reference = pred.rf.x$actual,
                positive = "positive")
## Confusion Matrix and Statistics
##
## Reference
## Prediction negative positive
## negative 3166 719
## positive 1833 4280
##
## Accuracy : 0.7447
## 95% CI : (0.7361, 0.7533)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.4895
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.8562
## Specificity : 0.6333
## Pos Pred Value : 0.7001
## Neg Pred Value : 0.8149
## Prevalence : 0.5000
## Detection Rate : 0.4281
## Detection Prevalence : 0.6114
## Balanced Accuracy : 0.7447
##
## 'Positive' Class : positive
##
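As a check on where the headline numbers come from, the main metrics follow directly from the four cells of the table above, with "positive" as the positive class:

TP <- 4280; FN <- 719  # actual positive reviews
TN <- 3166; FP <- 1833 # actual negative reviews

(TP + TN) / (TP + TN + FP + FN) # accuracy       ~ 0.7447
TP / (TP + FN)                  # sensitivity    ~ 0.8562
TN / (TN + FP)                  # specificity    ~ 0.6333
TP / (TP + FP)                  # pos pred value ~ 0.7001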
# DO NOT RUN
ctrl <- trainControl(method="repeatedcv", number = 3, repeats = 2)
fb_forest <- train(y_label ~ ., data = train_rf, method = "rf", trControl = ctrl)
saveRDS(fb_forest, "fb_forest.RDS") # save the model

fb_forest <- readRDS("fb_forest.RDS")
fb_forest
## Random Forest
##
## 40000 samples
## 50 predictor
## 2 classes: 'negative', 'positive'
##
## No pre-processing
## Resampling: Cross-Validated (3 fold, repeated 2 times)
## Summary of sample sizes: 26668, 26666, 26666, 26667, 26666, 26667, ...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.7122125 0.4244237
## 26 0.7025750 0.4051451
## 50 0.7011874 0.4023702
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.
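The resampled accuracy (about 0.71 at mtry = 2) is somewhat below the hold-out accuracy of the ranger model above. To pull the chosen tuning parameter or the final fit out of the caret object, a short sketch:

fb_forest$bestTune   # the mtry value selected by resampling (2 here)
fb_forest$finalModel # the randomForest model refit on the full training set
plot(fb_forest)      # accuracy across the mtry grid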
kata_terpenting <- varImp(fb_forest)$importance
kata_terpenting

kata_terpenting[5, 1]
## [1] 10.0228
kata_seleksi <- varImp(fb_forest)$importance %>%
  filter(Overall >= 8.000000)
kata_seleksi

length(corp_dtm_awal_test)
## [1] 6
bernoulli_conv <- function(x){
  x <- as.factor(ifelse(x > 10.000000, 1, 0))
  return(x)
}

train_naive <- apply(X = kata_seleksi, MARGIN = 2, FUN = bernoulli_conv)
test_naive <- apply(X = kata_seleksi, MARGIN = 2, FUN = bernoulli_conv)
label_train_naive <- as.factor(train$sentiment)
label_test_naive <- as.factor(test$sentiment)

kata_terseleksi <- varImp(fb_forest)
kata_terseleksi$importance %>%
as.data.frame() %>%
filter(Overall >= 8.000000) %>%
rownames_to_column("kata_terseleksi") %>%
pull(kata_terseleksi)
## [1] "acting" "also" "bad" "best" "better" "can" "even" "film"
## [9] "first" "get" "good" "great" "just" "life" "like" "love"
## [17] "made" "make" "movie" "much" "one" "people" "plot" "really"
## [25] "see" "still" "story" "think" "time" "way" "well" "will"
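The train_naive and test_naive objects above appear to be set up for a Bernoulli Naive Bayes comparison that is not fitted in this section. A minimal sketch of how that model could be trained with e1071, assuming (my assumptions, not the original code) that the selected words are used to subset the tf-idf data frames and that term presence (tf-idf > 0) is the binarization:

# words selected by random forest importance (same pattern as above)
kata <- varImp(fb_forest)$importance %>%
  rownames_to_column("word") %>%
  filter(Overall >= 8) %>%
  pull(word)

# binarize term presence per document (assumption: tf-idf > 0 means the word occurs)
to_presence <- function(x) factor(as.integer(x > 0), levels = c(0, 1))
train_nb <- as.data.frame(lapply(data_movie.clean[, kata], to_presence))
test_nb <- as.data.frame(lapply(data_movie.clean_test[, kata], to_presence))

model_nb <- naiveBayes(x = train_nb, y = label_train_naive, laplace = 1)
pred_nb <- predict(model_nb, newdata = test_nb)
confusionMatrix(data = pred_nb, reference = label_test_naive, positive = "positive")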