options(scipen = 999)
library(tm)

## Loading required package: NLP

library(SnowballC)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(tidyverse)

## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --

## v ggplot2 3.2.1     v purrr   0.3.3
## v tibble  2.1.3     v stringr 1.4.0
## v tidyr   1.0.0     v forcats 0.4.0
## v readr   1.3.1

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x ggplot2::annotate() masks NLP::annotate()
## x dplyr::filter()     masks stats::filter()
## x dplyr::lag()        masks stats::lag()

library(devtools)

## Loading required package: usethis

library(katadasaR)
library(tokenizers)
library(wordcloud)

## Loading required package: RColorBrewer

library(NLP)

Read data

The data were obtained from the Indonesian Institute of Sciences (LIPI). Some cleaning had already been done, in the form of removing missing values. The data consist of two data sets: positive and negative sentiment. The aim of this analysis is to detect sentiment polarity in Indonesian user-generated text.

The two data sets are combined into a single data frame.

# Load the negative and positive review files; both are pipe-delimited.
neg <- read.csv(file = "data_input/olshop_negative.csv", sep = "|")
pos <- read.csv(file = "data_input/olshop_positive.csv", sep = "|")

# Combine the two sets. With no `by` given, merge() joins on every column the
# two frames share, and all = TRUE keeps the unmatched rows from both sides.
# NOTE(review): if the intent is simply to stack the two files, confirm that
# join semantics (row matching and resulting row order) are what is wanted.
sentimen <- merge(x = pos, y = neg, all = TRUE)
sentimen

# Inspect the column types and sizes of the combined data.
str(object = sentimen)

## 'data.frame':    12319 obs. of  4 variables:
##  $ no         : Factor w/ 12319 levels "100","10003",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ title      : Factor w/ 8942 levels "'' Bukalapak adalah Jaminan Kepuasan Pelanggan\"",..: 4859 619 3387 3858 5970 4307 5391 4338 3173 4800 ...
##  $ text       : Factor w/ 11934 levels "-","- rate is always the cheapest - point is good - straightforward term and condition - many easy ways of payment "| __truncated__,..: 7630 811 4065 652 9071 3372 6800 8895 3430 6840 ...
##  $ senti_value: Factor w/ 7 levels "4","5","senti_value",..: 2 1 2 2 2 1 1 2 2 1 ...

no : Record number of the comment
title : The heading of the comment
text : The main body of the comment, which conveys the customer's opinion

Pre-Processing

We only need the title and text columns, so we select them into sentimen1 and combine them into a new column named comb.

# Keep only the two free-text columns of the combined data.
sentimen1 <- select(sentimen, title, text)

# Join heading and body into one space-separated string per review.
sentimen1[["comb"]] <- paste(sentimen1[["title"]], sentimen1[["text"]])

Check for missing values in the data set

# Count the NA entries in each column of sentimen1.
# Note: comb is produced by paste(), which renders NA as the text "NA", so its
# count here is 0 regardless of what title and text contain.
colSums(is.na(sentimen1))

## title  text  comb 
##     0     0     0

Convert the data frame into a corpus

# Wrap the combined review text in a tm corpus: the comb column is turned
# into a vector source, which VCorpus() then reads document by document.
sentimen1_corpus <- VCorpus(VectorSource(sentimen1[["comb"]]))

# Show the raw text of the first document.
content(sentimen1_corpus[[1]])

## [1] "Pesan barang di Ebay dlm satu klik Sekarang mau blanja barang apa ajah di Ebay mudah sekali, tidak direpotkan dgn urusan pajak, ongkir, bea cukai dll. prosesnya semudah blanja barang di dlm negeri dgn pilihan pembayaran yg beragam dan aman. Tidak perlu khawatir barang tidak akan sampai atau tersasar. tinggal klik, bayar dan tunggu barang sampai di rumah."

# Read the Indonesian stop-word list (presumably one word per line — confirm
# against the file). The result is used later as the word list for removeWords.
# warn = FALSE silences only readLines()' own notices (e.g. a missing final
# newline) instead of muting every warning raised during the call, as the
# previous suppressWarnings() wrapper did.
stopkata <- readLines("data_input/stopwords-id.txt", warn = FALSE)

The next main step is transformation of the Corpus, so that the corpus is ready for our analysis. Transformation involves performing the following steps.

- Remove punctuation
- Convert to lower case
- Remove stop words (such as "dont", "can", etc.) using a stop-word lexicon; here an Indonesian list read from data_input/stopwords-id.txt is used
- Remove numbers
- Remove brackets
- Remove extra whitespace
- Stem the documents, which reduces words to a root form; e.g. words such as “serve”, “service”, “server” are stemmed to the common root “serv”. Stemming is optional as it might lead to a loss of context. We have used the SnowballC package to perform the stemming.
- Transform the documents into a Document-Term Matrix, so that we get a matrix of documents, terms and their frequencies, which can then be converted to a normal matrix on which we can perform analytical tasks.

# Normalise the corpus one transformation at a time:
# lower-case, drop punctuation, drop digits, collapse whitespace, then stem.
# NOTE(review): stemDocument() is called without a language argument —
# confirm that the stemmer it selects is appropriate for Indonesian text.
sen_t <- tm_map(sentimen1_corpus, content_transformer(tolower))
sen_t <- tm_map(sen_t, removePunctuation)
sen_t <- tm_map(sen_t, removeNumbers)
sen_t <- tm_map(sen_t, stripWhitespace)
sen_t <- tm_map(sen_t, stemDocument)

Remove Indonesian stop words

# Drop every word that appears in the Indonesian stop-word list (stopkata).
sen_t1 <- tm_map(sen_t, removeWords, stopkata)

# Corpus transformer that reduces each word of a document to its root form:
# the text is split into words, every word is passed through katadasar(),
# and the results are re-joined with single spaces.
stemming_bahasa <- content_transformer(function(x) {
  # Qualify words() explicitly: stringr (attached via tidyverse) also exports
  # an object named `words`, so the bare name is easy to misread.
  tokens <- NLP::words(x)
  # vapply() pins the result to a character vector, whereas sapply() can
  # return a list (e.g. for a document with no words left).
  roots <- vapply(
    tokens,
    katadasaR::katadasar,
    character(1),
    USE.NAMES = FALSE
  )
  paste(roots, collapse = " ")
})

# Apply the root-word transformer to every document of the corpus.
sen_t12 <- sen_t1 %>%
  tm_map(stemming_bahasa)

# Cleaned text of the first document.
content(sen_t12[[1]])

## [1] "pesan barang ebay dlm klik blanja barang ajah ebay mudah repot dgn urus pajak ongkir bea cukai dll proses mudah blanja barang dlm neger dgn pilih bayar yg agam aman khawatir barang sasar tinggal klik bayar tunggu barang rumah"

# Cleaned text of the sixteenth document.
content(sen_t12[[16]])

## [1] "good job belanja bliblicom five star abissss"

# Run the corpus through stemDocument() once more.
clean_corpus <- sen_t12 %>%
  tm_map(stemDocument)

# Build the document-term matrix: one row per document, one column per term.
clean_dtm <- clean_corpus %>%
  DocumentTermMatrix()

# Convert to a base matrix for the frequency summaries that follow.
clean_m <- clean_dtm %>%
  as.matrix()

We can build a word cloud

# clean_m comes from a DocumentTermMatrix, so documents are its rows and terms
# are its columns. Per-term totals are therefore the column sums; rowSums()
# would give the number of tokens in each document instead.
term_freq <- colSums(clean_m)
term_freq <- sort(term_freq, decreasing = TRUE)
head(term_freq)

## (Output not shown: the listing previously printed here was keyed by
## document number, i.e. it came from row sums. Re-run to regenerate.)

# Peek at the first three documents of sen_t, i.e. the corpus as it was
# before the stop-word removal step.
head(sen_t, 3)

## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 3

# Draw a word cloud from the root-stemmed corpus: only terms occurring at
# least 10 times, capped at 100 words, coloured with the "Set2" palette.
# random.order is spelled FALSE rather than F, because F is an ordinary
# variable that can be reassigned.
wordcloud(
  sen_t12,
  min.freq = 10,
  max.words = 100,
  random.order = FALSE,
  colors = brewer.pal(8, "Set2")
)

Sentiment Analysis of User-Generated Indonesian Text

Read data

Pre-Processing

Convert the data frame into a corpus

Remove Indonesian stop words