In this Project, I am analyzing the sentiments of Consumer electronics, specifically on Camera reviews. I have taken 180 reviews of a camera from Adorama website.
Following libraries were used:
rvest,stringr,RCurl,XML,stringr,httr,knitr,dplyr,RMySQL,mscstexta4r,jsonlite,tm,RWeka,
SnowballC,caret,rminer,kernlab,rpart,caTools,party,FSelector,class,randomForest,
RTextTools,syuzhet
I initially started scraping in R. Since I was having issues like time out and its not working consistently, I used webscraper tool to extract the reviews.
options(stringsAsFactors = FALSE)
#site <- "https://www.adorama.com/r/inkd7100-reviews"
u <- "https://www.adorama.com/r/inkd7100-reviews?StartAt="
site <- lapply( seq(11,91,10), function(y){paste0(u, y, "#pr-review-sort")})
#do.call(rbind,lapply(site, function(x) {
#
# (data.frame(url=x,
# comments = read_html(GET(x,user_agent("myagent")))%>%
# html_nodes(".pr-comments")%>%html_text(),
# bot=read_html(GET(x,user_agent("myagent")))%>% #html_nodes(".pr-review-author-location")%>% html_text() %>% str_replace("from" , "" ) #%>% str_replace_all( "\\r\\n" , "") %>% str_replace_all( "-" , "") %>% str_replace_all( " Â " #, "") %>% trimws("l"))
#
# )
#
#}
#) )-> df11
#df <- merge (df, df, by=c("url","comments"))
#df3 <- data.frame(df1)
camera <- read.csv("C:/cuny/camera1.csv" ,header=TRUE,sep = ',')
df3 <- data.frame(camera)
sent <- get_nrc_sentiment(df3$comments)
#if (sent$positive >= sent$negative) {
# df3$rating = "Positive"} else { df3$rating = "Negative"}
df3$rating <- as.factor(ifelse(sent$positive >= sent$negative,"Positive","Negative"))
I initially used the Text Analytics API from Microsoft Azure to get a scoring on the reviews and it worked fine. I got issues with the second set of reviews that I used. Thought the return code was success, I had "no applicable method for 'content' applied to an object of class "response" error. So, I used package syuzhet to get the sentiment score.
#converting customer type to factor
#trim <- function (x) gsub("^\\s+|\\s+$", "", x)
#df3$bot <- trim(df3$bot)
#df3$bot <- str_replace(df3$bot,"","General")
#df3$bot[df$bot==""] <- "Unknown"
df3$bot <- as.factor(df3$bot)
#sentiment scores
# Below is the Request body for the API having text id 1 = Negative sentiments, id 2 = Positive sentiments
#docsLanguage <- rep("en", length(df3$comments))
#revseq<-seq(1,length(df3$comments), by =1)
#request_body <- data.frame(
#language = docsLanguage,
#id = revseq,
#text = df3$comments
#)
#request_body
# Converting the Request body(Dataframe) to Request body(JSON)
#request_body_json <- toJSON(list(documents = request_body), auto_unbox = TRUE)
#request_body_json
# Below we are calling API (Adding Request headers using add_headers)
#result1 <- POST("https://westus.api.cognitive.microsoft.com/text/analytics/v2.0/sentiment",
#body = request_body_json,
#add_headers(.headers = c("Content-Type"="application/json",
# "Ocp-Apim-Subscription-Key"="e53ce6ebbdeb4be68dd85bc722418e23",
# "Accept" = "application/json")))
#Outputx <- content(result1)
#raw_contents <- result1$content
#json_raw <- httr::content(raw_contents, type = "text")
# Show Output
#data2 <- toJSON(Output)
#d2<-jsonlite::fromJSON(data2, simplifyDataFrame = TRUE)
#df3$score <- d2$documents$score
#df3$id <- d2$documents$id
This data set has 180 reviews by different customer type.
#Plot
barplot(prop.table(table(df3$bot)), main = "Reviews by Customer Type", xlab="Customer Type", ylab = "Review Count")
#df3$rating[df3$score >= 0.6] <- "Positive"
#df3$rating[df3$score < 0.6] <- "Negative"
#df3$rating <- as.factor(df3$rating)
#write.csv(df1, file = "camera.csv", row.names = FALSE)
#Analyze
#Prepare train and test
## 75% of the sample size
smp_size <- floor(0.75 * nrow(df3))
## set the seed to make your partition reproductible
set.seed(123)
train_ind <- sample(seq_len(nrow(df3)), size = smp_size)
train <- df3[train_ind, ]
test <- df3[-train_ind, ]
nrow(train)
## [1] 135
summary(train)
## comments bot rating
## Length:135 :63 Negative: 8
## Class :character Beginner : 2 Positive:127
## Mode :character Hobbyist :45
## Professional: 8
## Semi-Pro :17
nrow(test)
## [1] 45
summary(test)
## comments bot rating
## Length:45 :21 Negative: 4
## Class :character Beginner : 0 Positive:41
## Mode :character Hobbyist :16
## Professional: 3
## Semi-Pro : 5
barplot(prop.table(table(train$rating)),main = "Review Rating in Training Data", ylab = "Review Count")
barplot(prop.table(table(test$rating)),main = "Review Rating in Test Data ", ylab = "Review Count")
train_corpus <- Corpus(VectorSource(train$comments))
length(train_corpus)
## [1] 135
train_corpus <- tm_map(train_corpus, content_transformer(tolower))
train_corpus <- tm_map(train_corpus, removeWords, stopwords("english") )
train_corpus <- tm_map(train_corpus, removePunctuation)
train_corpus <- tm_map(train_corpus, removeNumbers)
train_corpus <- tm_map(train_corpus, stripWhitespace)
train_corpus_st <- tm_map(train_corpus, stemDocument,language="english" )
#Document term matrix
train_dtm <- DocumentTermMatrix(train_corpus_st)
dim(train_dtm)
## [1] 135 1624
train_rmspa <- removeSparseTerms(train_dtm,0.95)
dim(train_rmspa)
## [1] 135 222
#show the avg frequency of top 20 frequent words
mean_train = sort(colMeans(as.matrix(train_rmspa)), decreasing = T)
mean_train[1:20]
## camera use nikon great get one will
## 2.1333333 0.9259259 0.7481481 0.5333333 0.4888889 0.4888889 0.4592593
## shoot pictur can good focus take imag
## 0.4296296 0.4148148 0.4074074 0.4074074 0.3851852 0.3703704 0.3629630
## photo featur set like need qualiti
## 0.3629630 0.3333333 0.3259259 0.3185185 0.3185185 0.3185185
avg_top20=mean(mean_train[1:20])
avg_top20
## [1] 0.5266667
barplot(mean_train[1:20], border=NA, las =3, main ='Top 20 words (by mean) of training data', ylab='Frequency', ylim=c(0,3))
tr_rmspa_m_no_zero=as.matrix(train_rmspa)
#convert zeros to NA.
is.na(tr_rmspa_m_no_zero) <- tr_rmspa_m_no_zero==0
# calculate mean without taking NA
mean_tr_m = sort(colMeans(as.matrix(tr_rmspa_m_no_zero ), na.rm = T), decreasing = T)
mean_tr_m[1:20]
## camera buffer pictur focus card problem bodi sensor
## 2.691589 2.250000 2.000000 1.925926 1.923077 1.900000 1.894737 1.888889
## photo use can nikon shoot take iso one
## 1.814815 1.811594 1.774194 1.771930 1.757576 1.724138 1.714286 1.692308
## set imag will issu
## 1.692308 1.689655 1.675676 1.636364
#average frequency of these top 20 words
avg_top20=mean(mean_tr_m[1:20])
avg_top20
## [1] 1.861453
barplot(mean_tr_m[1:20], border=NA, las =3, main='Top 20 words(after removing NA from Matrix)', ylab='Frequency', ylim=c(0,3))
Sentiment classification is done based on the frequence of the terms.
#############################################################
#Sentiment classification
train_bow_freq <- as.matrix(train_rmspa)
#combine the rating with term frequencies of the BOW in training data frame.
train_data_m=data.frame(y=as.factor(train$rating), x= train_bow_freq)
#summary(train_data_m)
#head(str(train_data_m),5)
train_bow_m = findFreqTerms(train_rmspa)
length(train_bow_m)
## [1] 222
##Test data
test_corpus <- Corpus(VectorSource(test$comments))
#generate test DTM based on training list of words.
bow_test_m <- DocumentTermMatrix(test_corpus, control = list(tolower = T,stopwords = T, removePunctuation = T,removeNumbers = T, stripWhitespace = T, stemming = T, dictionary = train_bow_m) )
#head(str(bow_test_m),5)
dim (bow_test_m)
## [1] 45 221
# transform to matrix.
test_bowfreq_m <- as.matrix(bow_test_m)
# combine the sentiment rating with term freq of the bow in test data frame.
test_data_m <- data.frame(y=as.factor(test$rating), x= test_bowfreq_m)
#head(str(test_data_m),5)
library(e1071)
#Build Naive Bayes Modal on training data set.
bow_nb_m <- naiveBayes(y ~., data = train_data_m)
summary(bow_nb_m)
## Length Class Mode
## apriori 2 table numeric
## tables 222 -none- list
## levels 2 -none- character
## call 4 -none- call
##generate prediction for testing data
testpred = predict(bow_nb_m, newdata=test_data_m)
confusionMatrix(testpred, test_data_m[,1], positive="Positive",dnn=c("Prediction","True"))
## Confusion Matrix and Statistics
##
## True
## Prediction Negative Positive
## Negative 4 41
## Positive 0 0
##
## Accuracy : 0.0889
## 95% CI : (0.0248, 0.2122)
## No Information Rate : 0.9111
## P-Value [Acc > NIR] : 1
##
## Kappa : 0
## Mcnemar's Test P-Value : 4.185e-10
##
## Sensitivity : 0.00000
## Specificity : 1.00000
## Pos Pred Value : NaN
## Neg Pred Value : 0.08889
## Prevalence : 0.91111
## Detection Rate : 0.00000
## Detection Prevalence : 0.00000
## Balanced Accuracy : 0.50000
##
## 'Positive' Class : Positive
##
mmetric(testpred,test_data_m[,1], c("ACC","TPR","PRECISION","F1"))
## ACC TPR1 TPR2 PRECISION1 PRECISION2 F11
## 8.888889 8.888889 0.000000 100.000000 0.000000 16.326531
## F12
## 0.000000
Sentiment classification is done based on Binary Weights rather than term frequency to compare between the two.
#Binary Weights
#Document term matrix with binary weights
train_bin_dtm <- DocumentTermMatrix(train_corpus_st, control=list(weighting=weightBin))
dim(train_bin_dtm)
## [1] 135 1624
#head(str(train_bin_dtm),5)
train_rmspa_bin_dtm = removeSparseTerms(train_bin_dtm,0.95)
dim(train_rmspa_bin_dtm)
## [1] 135 222
#bag of words
train_bow_95 = findFreqTerms(train_rmspa_bin_dtm)
#transfor to matrix
train_bow_bin <- as.matrix(train_rmspa_bin_dtm)
#show the avg frequency of top 20 frequent words
mean_train_bin = sort(colMeans(as.matrix(train_bow_bin)), decreasing = T)
mean_train_bin[1:20]
## camera use nikon great get one good
## 0.7925926 0.5111111 0.4222222 0.3407407 0.3185185 0.2888889 0.2740741
## will like shoot qualiti can just imag
## 0.2740741 0.2518519 0.2444444 0.2370370 0.2296296 0.2222222 0.2148148
## need take time featur pictur photo
## 0.2148148 0.2148148 0.2148148 0.2074074 0.2074074 0.2000000
avg_top20_bin=mean(mean_train_bin[1:20])
avg_top20_bin
## [1] 0.2940741
barplot(mean_train_bin[1:20], border=NA, las =3, main ='Top 20 words(Binary Weights)', ylab='Frequency', ylim=c(0,1))
#combine the rating with term frequencies of the BOW in training data frame.
train_data_bin_m=data.frame(y=as.factor(train$rating), x= train_bow_bin)
#summary(train_data_m)
#head(str(train_data_bin_m),5)
dim(train_data_bin_m)
## [1] 135 223
##Test data
#test_corpus
#generate test DTM based on training list of words.
test_dtm_bin <- DocumentTermMatrix(test_corpus, control = list(tolower = T,stopwords = T, removePunctuation = T,removeNumbers = T, stripWhitespace = T, stemming = T, dictionary = train_bow_95, list(weighting= weightBin) ))
#head(str(test_dtm_bin),5)
dim (test_dtm_bin)
## [1] 45 221
# transform to matrix.
test_dtm_bin_m <- as.matrix(test_dtm_bin)
# combine the sentiment rating with term freq of the bow in test data frame.
test_data_dtm_bin_m <- data.frame(y=as.factor(test$rating), x= test_dtm_bin_m)
#head(str(test_data_dtm_bin_m),5)
dim(test_data_dtm_bin_m)
## [1] 45 222
library(e1071)
#Build Naive Bayes Modal on training data set.
bow_nb_bin_m <- naiveBayes(y ~., data = train_data_bin_m)
summary(bow_nb_bin_m)
## Length Class Mode
## apriori 2 table numeric
## tables 222 -none- list
## levels 2 -none- character
## call 4 -none- call
##generate prediction for testing data
testpred = predict(bow_nb_bin_m, newdata=test_data_dtm_bin_m)
confusionMatrix(testpred, test_data_dtm_bin_m[,1], positive="Positive",dnn=c("Prediction","True"))
## Confusion Matrix and Statistics
##
## True
## Prediction Negative Positive
## Negative 4 41
## Positive 0 0
##
## Accuracy : 0.0889
## 95% CI : (0.0248, 0.2122)
## No Information Rate : 0.9111
## P-Value [Acc > NIR] : 1
##
## Kappa : 0
## Mcnemar's Test P-Value : 4.185e-10
##
## Sensitivity : 0.00000
## Specificity : 1.00000
## Pos Pred Value : NaN
## Neg Pred Value : 0.08889
## Prevalence : 0.91111
## Detection Rate : 0.00000
## Detection Prevalence : 0.00000
## Balanced Accuracy : 0.50000
##
## 'Positive' Class : Positive
##
mmetric(testpred,test_data_dtm_bin_m[,1], c("ACC","TPR","PRECISION","F1"))
## ACC TPR1 TPR2 PRECISION1 PRECISION2 F11
## 8.888889 8.888889 0.000000 100.000000 0.000000 16.326531
## F12
## 0.000000
Sentiment classification using Term Frequency Inverse Document Frequency- TfIdf.
#TfIdf
#Document term matrix with binary weights
train_tfidf_dtm <- DocumentTermMatrix(train_corpus_st, control=list(weighting=weightTfIdf))
dim(train_tfidf_dtm)
## [1] 135 1624
#head(str(train_bin_dtm),5)
train_rmspa_tfidf_dtm = removeSparseTerms(train_tfidf_dtm,0.95)
dim(train_rmspa_tfidf_dtm)
## [1] 135 222
#bag of words
train_bow_95 = findFreqTerms(train_rmspa_tfidf_dtm)
#transfor to matrix
train_bow_tfidf <- as.matrix(train_rmspa_tfidf_dtm)
#show the avg frequency of top 20 frequent words
mean_train_tfidf = sort(colMeans(as.matrix(train_bow_tfidf)), decreasing = T)
mean_train_tfidf[1:20]
## like excel everyth great happi love
## 0.02357553 0.02356929 0.02245487 0.02194978 0.02147346 0.02026070
## nikon good use camera pleas pictur
## 0.02019247 0.02006135 0.01963605 0.01820228 0.01792447 0.01786050
## imag easi time qualiti upgrad option
## 0.01742874 0.01657303 0.01617229 0.01613746 0.01582568 0.01508249
## bodi best
## 0.01506548 0.01495082
avg_top20_tfidf=mean(mean_train_tfidf[1:20])
avg_top20_tfidf
## [1] 0.01871984
barplot(mean_train_tfidf[1:20], border=NA, las =3, main ='Top 20 words(tfidf weights) in Training data', ylab='Frequency', ylim=c(0,0.5))
#combine the rating with term frequencies of the BOW in training data frame.
train_data_tfidf_m=data.frame(y=as.factor(train$rating), x= train_bow_tfidf)
#summary(train_data_m)
#head(str(train_data_bin_m),5)
dim(train_data_tfidf_m)
## [1] 135 223
##Test data
#test_corpus
#generate test DTM based on training list of words.
test_dtm_tfidf <- DocumentTermMatrix(test_corpus, control = list(tolower = T,stopwords = T, removePunctuation = T,removeNumbers = T, stripWhitespace = T, stemming = T, dictionary = train_bow_95, list( weighting= weightTfIdf) ))
#head(str(test_dtm_bin),5)
dim (test_dtm_tfidf)
## [1] 45 221
# transform to matrix.
test_dtm_tfidf_m <- as.matrix(test_dtm_tfidf)
# combine the sentiment rating with term freq of the bow in test data frame.
test_data_dtm_tfidf_m <- data.frame(y=as.factor(test$rating), x= test_dtm_tfidf_m)
#head(str(test_data_dtm_bin_m),5)
dim(test_data_dtm_tfidf_m)
## [1] 45 222
library(e1071)
#Build Naive Bayes Modal on training data set.
bow_nb_tfidf_m <- naiveBayes(y ~., data = train_data_tfidf_m)
summary(bow_nb_tfidf_m)
## Length Class Mode
## apriori 2 table numeric
## tables 222 -none- list
## levels 2 -none- character
## call 4 -none- call
##generate prediction for testing data
testpred = predict(bow_nb_tfidf_m, newdata=test_data_dtm_tfidf_m)
confusionMatrix(testpred, test_data_dtm_tfidf_m[,1], positive="Positive", dnn=c("Prediction","True"))
## Confusion Matrix and Statistics
##
## True
## Prediction Negative Positive
## Negative 3 34
## Positive 1 7
##
## Accuracy : 0.2222
## 95% CI : (0.112, 0.3709)
## No Information Rate : 0.9111
## P-Value [Acc > NIR] : 1
##
## Kappa : -0.0168
## Mcnemar's Test P-Value : 6.338e-08
##
## Sensitivity : 0.17073
## Specificity : 0.75000
## Pos Pred Value : 0.87500
## Neg Pred Value : 0.08108
## Prevalence : 0.91111
## Detection Rate : 0.15556
## Detection Prevalence : 0.17778
## Balanced Accuracy : 0.46037
##
## 'Positive' Class : Positive
##
mmetric(testpred,test_data_dtm_tfidf_m[,1], c("ACC","TPR","PRECISION","F1"))
## ACC TPR1 TPR2 PRECISION1 PRECISION2 F11
## 22.222222 8.108108 87.500000 75.000000 17.073171 14.634146
## F12
## 28.571429
dtm.docs <- train_dtm
docs.s <- train_corpus_st
sums <- colapply_simple_triplet_matrix(dtm.docs,FUN=sum)
sums <- sort(sums, decreasing=T)
# Functions
BigramTokenizer <- function(x) {RWeka::NGramTokenizer(x, RWeka::Weka_control(min=2, max=2))}
ThreegramTokenizer <- function(x) {RWeka::NGramTokenizer(x, RWeka::Weka_control(min=3, max=3))}
FourgramTokenizer <- function(x) {RWeka::NGramTokenizer(x, RWeka::Weka_control(min=4, max=4))}
# Bigrams
options(mc.cores=1)
dtm.docs.2g <- DocumentTermMatrix(docs.s, control=list(tokenize=BigramTokenizer))
#Threegrams
options(mc.cores=1)
dtm.docs.3g <- DocumentTermMatrix(docs.s, control=list(tokenize=ThreegramTokenizer))
#Fourgrams
options(mc.cores=1)
dtm.docs.4g <- DocumentTermMatrix(docs.s, control=list(tokenize=FourgramTokenizer))
# freqTerms.4g.docs <- findFreqTerms(dtm.docs.4g,20,Inf)
# To get the bigram dist
sums.2g <- colapply_simple_triplet_matrix(dtm.docs.2g,FUN=sum)
sums.2g <- sort(sums.2g, decreasing=T)
# To get the threegram dist
sums.3g <- colapply_simple_triplet_matrix(dtm.docs.3g,FUN=sum)
sums.3g <- sort(sums.3g, decreasing=T)
# To get the fourgram dist
sums.4g <- colapply_simple_triplet_matrix(dtm.docs.4g,FUN=sum)
sums.4g <- sort(sums.4g, decreasing=T)
barplot(sums.2g[1:20], border=NA, las =3, main ='Bigram Distribution of Top 20 frequent words', ylab='Count', ylim=c(0,300))
barplot(sums.3g[1:20], border=NA, las =3, main ='Three gram Distribution of Top 20 frequent words', ylab='Count', ylim=c(0,300))
barplot(sums.4g[1:20], border=NA, las =3, main ='Four gram Distribution of Top 20 frequent words', ylab='Count', ylim=c(0,300))
kMean plot using the Top 20 frequent words from the 4 gram dist of training data.
m <- as.matrix(sums.4g[1:20])
d <-dist(m)
#k means algorithm, 3 clusters,
kfit <- kmeans(d, 3, nstart=5)
#plot
clusplot(m, kfit$cluster, color=T, shade=T, labels=2, lines=0)
Based on the comparison of modals using term frequency, binary weights and tfidf, the observation is, Accuracy, F1 measure and all metrics has improved a lot with tfidf. Also rear terms have been promoted using tfidf as it measures how important a term is. Lot of positive words reported by IDF that did not appear with term frequency or binary weights. I believe the metrics will be even higher with a larger data set.