This section applies text mining algorithms to gain insights from the
data. The choice of model depended on the nature of the data and the
problem being solved, as well as on the goals and objectives of this
data mining project.
3.3 Sentiment Analysis on Amazon Earphone Review Dataset
# Wrap the review bodies in a tm corpus
TextDoc <- Corpus(VectorSource(df[["ReviewBody"]]))
# Peek at the first rows of the raw data
head(df)
## # A tibble: 6 x 4
## ReviewTitle Revie~1 Revie~2 Product
## <chr> <chr> <dbl> <chr>
## 1 "Honest review of an edm music lover\n" "No do~ 3 boAt R~
## 2 "Unreliable earphones with high cost\n" "This ~ 1 boAt R~
## 3 "Really good and durable.\n" "i bou~ 4 boAt R~
## 4 "stopped working in just 14 days\n" "Its s~ 1 boAt R~
## 5 "Just Awesome Wireless Headphone under 1000...\U0001f~ "Its A~ 5 boAt R~
## 6 "Charging port not working\n" "After~ 1 boAt R~
## # ... with abbreviated variable names 1: ReviewBody, 2: ReviewStar
# Helper that swaps every match of a pattern for a single space; wrapped in
# content_transformer() so tm_map() can apply it to the corpus
toSpace <- content_transformer(function(txt, pat) gsub(pat, " ", txt))
# Turn the separator characters "/", "@" and "|" into spaces
for (sep_pattern in c("/", "@", "\\|")) {
  TextDoc <- tm_map(TextDoc, toSpace, sep_pattern)
}
# Lower-case everything, then drop digits
TextDoc <- tm_map(TextDoc, content_transformer(tolower))
TextDoc <- tm_map(TextDoc, removeNumbers)
# Drop the common English stop words, then the project-specific ones
# (list any custom stop words in the character vector)
TextDoc <- tm_map(TextDoc, removeWords, stopwords("english"))
TextDoc <- tm_map(TextDoc, removeWords, "https")
# Strip punctuation and collapse extra white space
TextDoc <- tm_map(TextDoc, removePunctuation)
TextDoc <- tm_map(TextDoc, stripWhitespace)
# Stemming: reduce each word to its root form
TextDoc <- tm_map(TextDoc, stemDocument)
# Term-document matrix, converted to a plain matrix for row sums
TextDoc_dtm <- TermDocumentMatrix(TextDoc)
dtm_m <- as.matrix(TextDoc_dtm)
# Total frequency of each term, most frequent first
dtm_v <- sort(rowSums(dtm_m), decreasing = TRUE)
dtm_d <- data.frame(word = names(dtm_v), freq = dtm_v)
# Show the 8 most frequent terms
head(dtm_d, n = 8)
## word freq
## good good 6930
## sound sound 6202
## qualiti qualiti 5945
## product product 4956
## bass bass 2753
## earphon earphon 2615
## use use 2510
## work work 2024
# Bar chart of the 8 most frequent words. The palette holds 10 shades
# (HSV value running from 0 to 1) for the 8 bars.
barplot(
  dtm_d$freq[1:8],
  las = 2,
  names.arg = dtm_d$word[1:8],
  col = hsv(h = 1, s = 1, v = seq(0, 1, length.out = 10)),
  main = "Top 8 most frequent words",
  ylab = "Word frequencies"
)

# Word cloud of at most 100 terms that occur at least 5 times; the seed is
# set first so the random parts of the layout repeat between runs
set.seed(1234)
wordcloud(
  words = dtm_d$word,
  freq = dtm_d$freq,
  min.freq = 5,
  max.words = 100,
  random.order = FALSE,
  rot.per = 0.40,
  colors = brewer.pal(n = 8, name = "Dark2")
)

# Terms associated with the stems "good", "sound" and "qualiti",
# using a correlation limit of 0.15
findAssocs(
  TextDoc_dtm,
  terms = c("good", "sound", "qualiti"),
  corlimit = 0.15
)
## $good
## bass batteri also life overal
## 0.20 0.19 0.18 0.18 0.15
##
## $sound
## bass earphon clear
## 0.24 0.18 0.16
##
## $qualiti
## build built wire also
## 0.31 0.18 0.16 0.15
With a correlation limit of 0.15, words like “battery”, “life” and
“overall” were associated with “good”, while words like “earphone”,
“clear” and “bass” were associated with “sound”.
# Sentiment score per review with get_sentiment(), "syuzhet" method
syuzhet_vector <- get_sentiment(df[["ReviewBody"]], method = "syuzhet")
summary(syuzhet_vector)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -4.100 0.250 0.750 1.045 1.600 16.100
# Same scoring, "bing" method
bing_vector <- get_sentiment(df[["ReviewBody"]], method = "bing")
summary(bing_vector)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -7.000 0.000 1.000 1.124 2.000 14.000
# Same scoring, "afinn" method
afinn_vector <- get_sentiment(df[["ReviewBody"]], method = "afinn")
summary(afinn_vector)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -13.000 1.000 3.000 3.845 6.000 46.000
# Compare the sign (-1, 0 or 1) of the leading scores (head() of each
# vector) across the three methods; rows are syuzhet, bing, afinn in order
sign(rbind(
  head(syuzhet_vector),
  head(bing_vector),
  head(afinn_vector)
))
It is evident that most of the review sentiments were positive,
because the median score under all three methods (syuzhet, bing and
afinn) was positive.
# NRC sentiment analysis.
# NOTE: get_nrc_sentiment() takes a great deal of RAM, so only the first 1000
# reviews are scored instead of the whole 14337 observations.
n_nrc_reviews <- min(1000, nrow(df))
d <- get_nrc_sentiment(df$ReviewBody[seq_len(n_nrc_reviews)])
# Transpose so that each sentiment is a row and each scored review a column
td <- data.frame(t(d))
# Total per sentiment across ALL scored reviews. Summing every column avoids
# dropping the first review (as indexing from column 2 would) and does not
# depend on a hard-coded number of reviews.
td_new <- data.frame(count = rowSums(td))
# Move the sentiment names from the row names into a regular column
td_new <- cbind("sentiment" = rownames(td_new), td_new)
rownames(td_new) <- NULL
# Keep the first 8 sentiment rows for plotting
td_new2 <- td_new[1:8, ]
# Bar chart of the word count associated with each sentiment.
# ggplot() + geom_col() draws the same bars as the deprecated quickplot().
ggplot(td_new2, aes(x = sentiment, y = count, fill = sentiment)) +
  geom_col() +
  ylab("count") +
  ggtitle("Product Sentiments")

# In the plot, trust and anticipation are the top sentiments (with almost the
# same score), followed by joy; all three are positive sentiments.
# Share of each of the first 8 sentiment columns of d in their combined
# total, multiplied by 100 so the plotted values are real percentages and
# match the "Percentage" axis label (prop.table() alone yields 0-1 fractions)
barplot(
  sort(100 * colSums(prop.table(d[, 1:8]))),
  horiz = TRUE,
  cex.names = 0.7,
  las = 1,
  main = "Emotions in Text", xlab = "Percentage"
)
Most of the reviews of the Amazon earphone products were positive, and
this signifies that the majority of the people who used the products were
pleased with their performance.
3.2.2 A Different Term Frequency Approach on Amazon Earphone
Reviews
# Tokenise the review bodies into a `word` column with unnest_tokens()
tidy_review <- unnest_tokens(df, word, ReviewBody)
# Word frequencies, most common first
count(tidy_review, word, sort = TRUE)
# Tokenise again, this time discarding every word listed in stop_words
tidy_review2 <- df %>%
  unnest_tokens(output = word, input = ReviewBody) %>%
  anti_join(stop_words)
# Far fewer words remain once the stop words are gone
tidy_review2
# Recount the remaining words, most common first
tidy_review2 %>%
  count(word, sort = TRUE)
# Visualising the text
# Rebuild the tidy table: tag each review with its row number, tokenise the
# review bodies and drop the stop words
tidy_review <- df %>%
  mutate(id = row_number()) %>%
  unnest_tokens(output = word, input = ReviewBody) %>%
  anti_join(stop_words)
tidy_review
# Count the words and keep only those used more than 700 times,
# most frequent first, so the chart stays readable
word_counts <- tidy_review %>%
  count(word, sort = TRUE) %>%
  filter(n > 700)
# Bar chart of the frequent words, flipped so the words sit on the
# vertical axis
ggplot(data = word_counts, mapping = aes(x = word, y = n)) +
  geom_col() +
  coord_flip() +
  labs(title = "Review Word Counts")

## # A tibble: 1 x 2
## word lexicon
## <chr> <chr>
## 1 2 CUSTOM
## Joining, by = "word"
## # A tibble: 20 x 3
## word n word2
## <chr> <int> <fct>
## 1 awesome 1230 awesome
## 2 bass 2831 bass
## 3 battery 1937 battery
## 4 bluetooth 813 bluetooth
## 5 buy 1045 buy
## 6 cancellation 780 cancellation
## 7 ear 1182 ear
## 8 earphone 1064 earphone
## 9 earphones 1547 earphones
## 10 life 932 life
## 11 money 781 money
## 12 music 959 music
## 13 nice 1053 nice
## 14 noise 1086 noise
## 15 price 1588 price
## 16 product 4795 product
## 17 quality 5970 quality
## 18 range 747 range
## 19 sound 6015 sound
## 20 worth 731 worth
# Same chart, using the word2 column for the bar order.
# NOTE(review): word2 is not created in any code shown in this section; the
# printed tibble lists it as a factor column of word_counts - confirm where
# it is built before relying on this plot.
ggplot(data = word_counts, mapping = aes(x = word2, y = n)) +
  geom_col() +
  coord_flip() +
  labs(title = "Review Word Counts")

# Word counts in one panel per product, bars coloured by product.
# NOTE(review): this needs Product (and word2) columns in word_counts; the
# count(word) call shown earlier keeps neither - confirm word_counts is
# rebuilt per product before this plot.
ggplot(data = word_counts, mapping = aes(x = word2, y = n, fill = Product)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~Product, scales = "free_y") +
  coord_flip() +
  labs(title = "Product Counts")

# Word counts in one panel per review star rating, bars coloured by rating.
# NOTE(review): this needs ReviewStar (and word2) columns in word_counts; the
# count(word) call shown earlier keeps neither - confirm word_counts is
# rebuilt per star rating before this plot.
ggplot(
  data = word_counts,
  mapping = aes(x = word2, y = n, fill = ReviewStar)
) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ReviewStar, scales = "free_y") +
  coord_flip() +
  labs(title = "Review Star Counts")
