library(dplyr)        # data wrangling
library(tidytext)     # tidy text mining helpers
library(textclean)    # text normalisation (contractions, slang, elongation, ...)
library(tm)           # corpus handling and document-term matrices
library(SnowballC)    # Snowball stemmer
library(stringr)      # string manipulation
library(rsample)      # train/test splitting
library(caret)        # model training and evaluation
library(e1071)        # naive Bayes
library(tidymodels)   # modelling framework (parsnip, yardstick, ...)
library(randomForest) # random forest

Read Dataset

data_movie <- read.csv("dataset_movie_review.csv")
head(data_movie)
colSums(is.na(data_movie))
##    review sentiment 
##         0         0

Splitting the Data into Train and Test Sets

I split the data into a train set and a test set, using 80% of the observations for training and 20% for testing.

RNGkind(sample.kind = "Rounding")
## Warning in RNGkind(sample.kind = "Rounding"): non-uniform 'Rounding' sampler
## used
set.seed(406)
index <- initial_split(data = data_movie, prop = 0.8, strata = "sentiment")
train <- training(index)
test <- testing(index)
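
Because the split is stratified on sentiment, both partitions should keep roughly the same class balance at an 80/20 size ratio. A quick optional sanity check (not part of the original output):

# check the split sizes and the class balance of the test partition
nrow(train)
nrow(test)
prop.table(table(test$sentiment))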

Text Cleaning

Text Cleansing of the Train Data

data_movie_train <- train %>% 
  mutate(review_clean = review %>% 
           str_to_lower() %>% # convert all text to lowercase
           replace_contraction() %>% # expand contractions to their multi-word forms
           replace_internet_slang() %>% # replace internet slang with standard words
           replace_word_elongation() %>% # normalise elongated words (e.g. "soooo" -> "so")
           replace_number(remove = T) %>% # remove numbers
           replace_date(replacement = "") %>% # remove dates
           str_remove_all(pattern = "[[:punct:]]") %>% # remove punctuation
           str_remove_all(pattern = "©") %>% # remove the copyright symbol
           replace_html(symbol = T) %>% # remove html tags and symbols
           str_squish() %>% # collapse repeated whitespace inside the string
           str_trim() # remove whitespace from the start and end of the string
  )

Text Cleansing of the Test Data

data_movie_test <- test %>% 
  mutate(review_clean = review %>% 
           str_to_lower() %>% # convert all text to lowercase
           replace_contraction() %>% # expand contractions to their multi-word forms
           replace_internet_slang() %>% # replace internet slang with standard words
           replace_word_elongation() %>% # normalise elongated words (e.g. "soooo" -> "so")
           replace_number(remove = T) %>% # remove numbers
           replace_date(replacement = "") %>% # remove dates
           str_remove_all(pattern = "[[:punct:]]") %>% # remove punctuation
           str_remove_all(pattern = "©") %>% # remove the copyright symbol
           replace_html(symbol = T) %>% # remove html tags and symbols
           str_squish() %>% # collapse repeated whitespace inside the string
           str_trim() # remove whitespace from the start and end of the string
  )
# saveRDS(data_movie_train, "data_movie_train.rds") # cache the cleaned train data
data_movie_train <- readRDS("data_movie_train.rds")
head(data_movie_train)
# saveRDS(data_movie_test, "data_movie_test.rds") # cache the cleaned test data
data_movie_test <- readRDS("data_movie_test.rds")
head(data_movie_test)
prop.table(table(data_movie_train$sentiment))
## 
## negative positive 
##  0.49985  0.50015

Next, I tokenize the train data, remove English stopwords, and apply TF-IDF weighting.

corp_train <- VCorpus(VectorSource(data_movie_train$review_clean))

corp_dtm_awal <- corp_train %>% 
  # remove the pre-built english stopwords
  tm_map(removeWords, stopwords("en")) %>%
  # convert the corpus to a document-term matrix with tf-idf weighting
  DocumentTermMatrix(control = list(weighting = weightTfIdf))

corp_dtm_awal_clean <- removeSparseTerms(corp_dtm_awal, 0.85) # keep terms that appear in at least 15% of documents
inspect(corp_dtm_awal_clean)
## <<DocumentTermMatrix (documents: 40000, terms: 50)>>
## Non-/sparse entries: 516011/1483989
## Sparsity           : 74%
## Maximal term length: 10
## Weighting          : term frequency - inverse document frequency (normalized) (tf-idf)
## Sample             :
##        Terms
## Docs          bad       film       good     great       just       like
##   171   0.3742526 0.00000000 0.00000000 0.0000000 0.00000000 0.00000000
##   19483 0.0000000 0.00000000 0.05354301 0.0000000 0.02372109 0.02118797
##   21781 0.1060382 0.00000000 0.07094449 0.0000000 0.00000000 0.00000000
##   23575 0.0000000 0.00000000 0.05675559 0.0000000 0.05028871 0.00000000
##   24211 0.1631358 0.00000000 0.00000000 0.3081811 0.00000000 0.00000000
##   26833 0.0963984 0.03980928 0.00000000 0.0000000 0.05714626 0.00000000
##   2728  0.6059328 0.00000000 0.10134927 0.0000000 0.00000000 0.00000000
##   28138 0.0000000 0.00000000 0.00000000 0.0000000 0.05238407 0.04679011
##   31708 0.0000000 0.00000000 0.00000000 0.0000000 0.00000000 0.14037032
##   503   0.0000000 0.00000000 0.00000000 0.2003177 0.12572177 0.00000000
##        Terms
## Docs         movie        one     really      story
##   171   0.04317077 0.00000000 0.10248619 0.20889285
##   19483 0.09693060 0.01594759 0.09861878 0.00000000
##   21781 0.07339031 0.00000000 0.00000000 0.00000000
##   23575 0.08806837 0.06761779 0.00000000 0.00000000
##   24211 0.00000000 0.00000000 0.00000000 0.27316757
##   26833 0.00000000 0.03841920 0.07919387 0.00000000
##   2728  0.05242165 0.00000000 0.00000000 0.12682780
##   28138 0.06115859 0.00000000 0.00000000 0.07398288
##   31708 0.09173789 0.00000000 0.00000000 0.00000000
##   503   0.00000000 0.00000000 0.00000000 0.00000000
head(corp_dtm_awal_clean)
## <<DocumentTermMatrix (documents: 6, terms: 50)>>
## Non-/sparse entries: 60/240
## Sparsity           : 80%
## Maximal term length: 10
## Weighting          : term frequency - inverse document frequency (normalized) (tf-idf)

Modelling

# convert train dtm to dataframe
data_movie.clean <- as.data.frame(as.matrix(corp_dtm_awal_clean), stringsAsFactors = F)
colSums(is.na(data_movie.clean))
##     acting       also        bad       best     better        can  character 
##          0          0          0          0          0          0          0 
## characters        end       even       ever       film      films      first 
##          0          0          0          0          0          0          0 
##        get       good      great       just       know       life       like 
##          0          0          0          0          0          0          0 
##     little       love       made       make       many      movie     movies 
##          0          0          0          0          0          0          0 
##       much      never        one     people       plot     really        say 
##          0          0          0          0          0          0          0 
##      scene     scenes        see       seen  something      still      story 
##          0          0          0          0          0          0          0 
##      think       time        two      watch   watching        way       well 
##          0          0          0          0          0          0          0 
##       will 
##          0

Preparing the Test Data

corp_test <- VCorpus(VectorSource(data_movie_test$review_clean))

corp_dtm_awal_test <- corp_test %>% 
  # remove the pre-built english stopwords
  tm_map(removeWords, stopwords("en")) %>%
  # convert to a document-term matrix, restricted to the vocabulary of the train DTM
  DocumentTermMatrix(control = list(weighting = weightTfIdf,
                                    dictionary = names(data_movie.clean)))
## Warning in weighting(x): empty document(s): 5267 5751 6737 8315 8572 9439
inspect(corp_dtm_awal_test)
## <<DocumentTermMatrix (documents: 9998, terms: 50)>>
## Non-/sparse entries: 128453/371447
## Sparsity           : 74%
## Maximal term length: 10
## Weighting          : term frequency - inverse document frequency (normalized) (tf-idf)
## Sample             :
##       Terms
## Docs   can film good great just       like movie one story will
##   4249   0    0    0     0    0 0.05994459     0   0     0    0
##   4808   0    0    0     0    0 0.00000000     0   0     0    0
##   489    0    0    0     0    0 0.00000000     0   0     0    0
##   6042   0    0    0     0    0 0.00000000     0   0     0    0
##   695    0    0    0     0    0 0.00000000     0   0     0    0
##   7342   0    0    0     0    0 0.00000000     0   0     0    0
##   7645   0    0    0     0    0 0.00000000     0   0     0    0
##   8359   0    0    0     0    0 0.00000000     0   0     0    0
##   8457   0    0    0     0    0 0.00000000     0   0     0    0
##   9139   0    0    0     0    0 0.00000000     0   0     0    0
data_movie.clean_test <- as.data.frame(as.matrix(corp_dtm_awal_test), stringsAsFactors = F)
data_movie.clean$y_label <- as.factor(data_movie_train$sentiment)

data_movie.clean_test$y_label <-  as.factor(data_movie_test$sentiment)

Feature Selection

train_rf <- data_movie.clean
test_rf <- data_movie.clean_test

colnames(train_rf) <- make.names(colnames(train_rf))

colnames(test_rf) <- make.names(colnames(test_rf))

head(train_rf)
model.rf <- rand_forest(trees = 500, mtry = 4, mode = "classification") %>%
  set_engine("ranger") %>% fit(y_label~., data = train_rf)

pred.rf <- predict(model.rf, test_rf, 
                   type = "class")

pred.rf.x <- as.data.frame(cbind(pred.rf, test_rf$y_label)) %>%
  setNames(c("pred","actual"))

pred.rf.x
confusionMatrix(data = pred.rf.x$pred, 
                reference = pred.rf.x$actual, 
                positive = "positive")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction negative positive
##   negative     3166      719
##   positive     1833     4280
##                                           
##                Accuracy : 0.7447          
##                  95% CI : (0.7361, 0.7533)
##     No Information Rate : 0.5             
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.4895          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.8562          
##             Specificity : 0.6333          
##          Pos Pred Value : 0.7001          
##          Neg Pred Value : 0.8149          
##              Prevalence : 0.5000          
##          Detection Rate : 0.4281          
##    Detection Prevalence : 0.6114          
##       Balanced Accuracy : 0.7447          
##                                           
##        'Positive' Class : positive        
## 
# DO NOT RUN: training is slow; load the saved model below instead
ctrl <- trainControl(method="repeatedcv", number = 3, repeats = 2)

fb_forest <- train(y_label ~ ., data = train_rf, method = "rf", trControl = ctrl)

saveRDS(fb_forest, "fb_forest.RDS") # save the model
fb_forest <-  readRDS("fb_forest.RDS")
fb_forest
## Random Forest 
## 
## 40000 samples
##    50 predictor
##     2 classes: 'negative', 'positive' 
## 
## No pre-processing
## Resampling: Cross-Validated (3 fold, repeated 2 times) 
## Summary of sample sizes: 26668, 26666, 26666, 26667, 26666, 26667, ... 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa    
##    2    0.7122125  0.4244237
##   26    0.7025750  0.4051451
##   50    0.7011874  0.4023702
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.
kata_terpenting <- (varImp(fb_forest)$importance) # variable importance score of each term
kata_terpenting
kata_terpenting[5,1]
## [1] 10.0228
kata_seleksi <- (varImp(fb_forest)$importance) %>% 
  filter(Overall >= 8.000000) # keep only terms with an importance score of at least 8

kata_seleksi
length(corp_dtm_awal_test)
## [1] 6

Naive Bayes

bernoulli_conv <- function(x){
  # Bernoulli conversion: mark a term as present (1) when its tf-idf weight is positive
  x <- as.factor(ifelse(x > 0, 1, 0))
  return(x)
}

Train and Test Data Split

# apply the Bernoulli conversion to the selected terms in the train and test feature matrices
train_naive <- apply(X = train_rf[, rownames(kata_seleksi)], MARGIN = 2, FUN = bernoulli_conv)
test_naive <- apply(X = test_rf[, rownames(kata_seleksi)], MARGIN = 2, FUN = bernoulli_conv)

label_train_naive <- as.factor(train$sentiment)
label_test_naive <- as.factor(test$sentiment)
kata_terseleksi <- varImp(fb_forest)
kata_terseleksi$importance %>% 
  as.data.frame() %>% 
  filter(Overall >= 8.000000) %>% 
  rownames_to_column("kata_terseleksi") %>% 
  pull(kata_terseleksi)
##  [1] "acting" "also"   "bad"    "best"   "better" "can"    "even"   "film"  
##  [9] "first"  "get"    "good"   "great"  "just"   "life"   "like"   "love"  
## [17] "made"   "make"   "movie"  "much"   "one"    "people" "plot"   "really"
## [25] "see"    "still"  "story"  "think"  "time"   "way"    "well"   "will"
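
With the Bernoulli features and labels in place, the Naive Bayes model itself can be fit with e1071 (loaded at the top). This is a minimal sketch; the object names model.naive and pred.naive and the laplace = 1 smoothing value are my own choices, not taken from the original analysis:

# fit a Bernoulli-style Naive Bayes classifier on the selected terms
model.naive <- naiveBayes(x = train_naive, y = label_train_naive, laplace = 1)

# predict the sentiment of the test reviews and evaluate
pred.naive <- predict(model.naive, newdata = test_naive)
confusionMatrix(data = pred.naive,
                reference = label_test_naive,
                positive = "positive")

Laplace smoothing avoids zero conditional probabilities for terms that never occur in one of the classes.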