Overview:

In this Project, I am analyzing the sentiments of Consumer electronics, specifically on Camera reviews. I have taken 180 reviews of a camera from Adorama website. 

Libraries:

Following libraries were used:

rvest,stringr,RCurl,XML,stringr,httr,knitr,dplyr,RMySQL,mscstexta4r,jsonlite,tm,RWeka,
SnowballC,caret,rminer,kernlab,rpart,caTools,party,FSelector,class,randomForest,
RTextTools,syuzhet

Scraping:

I initially started scraping in R. Since I was having issues like time out and its not working consistently, I used webscraper tool to extract the reviews.  
options(stringsAsFactors = FALSE)

#site <- "https://www.adorama.com/r/inkd7100-reviews"
u <- "https://www.adorama.com/r/inkd7100-reviews?StartAt="
site <- lapply( seq(11,91,10), function(y){paste0(u, y, "#pr-review-sort")})

#do.call(rbind,lapply(site, function(x) {
#  
#       (data.frame(url=x,
#                   comments = read_html(GET(x,user_agent("myagent")))%>%
#                     html_nodes(".pr-comments")%>%html_text(),
#                   bot=read_html(GET(x,user_agent("myagent")))%>% #html_nodes(".pr-review-author-location")%>% html_text() %>%      str_replace("from" , "" ) #%>% str_replace_all( "\\r\\n" , "") %>% str_replace_all( "-" , "") %>% str_replace_all( "  " #, "") %>% trimws("l"))
#
#          )
#  
#}
#) )-> df11

#df <- merge (df, df, by=c("url","comments"))
#df3 <- data.frame(df1)
camera <- read.csv("C:/cuny/camera1.csv" ,header=TRUE,sep = ',')
df3 <- data.frame(camera)
sent <- get_nrc_sentiment(df3$comments)


#if (sent$positive >= sent$negative) {
#      df3$rating = "Positive"} else { df3$rating = "Negative"}

df3$rating <- as.factor(ifelse(sent$positive >= sent$negative,"Positive","Negative"))

Sentiment Scoring:

I initially used the Text Analytics API from Microsoft Azure to get a scoring on the reviews and it worked fine. I got issues with the second set of reviews that I used. Thought the return code was success, I had "no applicable method for 'content' applied to an object of class "response" error. So, I used package syuzhet to get the sentiment score. 
#converting customer type to factor
#trim <- function (x) gsub("^\\s+|\\s+$", "", x)
#df3$bot <- trim(df3$bot)
#df3$bot <- str_replace(df3$bot,"","General")
#df3$bot[df$bot==""] <- "Unknown"
df3$bot <- as.factor(df3$bot)
#sentiment scores
# Below is the Request body for the API having text id 1 = Negative sentiments, id 2 = Positive sentiments

#docsLanguage <- rep("en", length(df3$comments))
#revseq<-seq(1,length(df3$comments), by =1)

#request_body <- data.frame(
#language = docsLanguage,
#id = revseq, 
#text = df3$comments
#)

#request_body

# Converting the Request body(Dataframe) to Request body(JSON)

#request_body_json <- toJSON(list(documents = request_body), auto_unbox = TRUE)

#request_body_json

# Below we are calling API (Adding Request headers using add_headers)

#result1 <- POST("https://westus.api.cognitive.microsoft.com/text/analytics/v2.0/sentiment",
#body = request_body_json,
#add_headers(.headers = c("Content-Type"="application/json",
#                         "Ocp-Apim-Subscription-Key"="e53ce6ebbdeb4be68dd85bc722418e23",
#                         "Accept" = "application/json")))

#Outputx <- content(result1)

#raw_contents <- result1$content  
#json_raw <- httr::content(raw_contents, type = "text")

# Show Output
#data2 <- toJSON(Output)
#d2<-jsonlite::fromJSON(data2, simplifyDataFrame = TRUE)

#df3$score <- d2$documents$score
#df3$id    <- d2$documents$id

Data Set:

This data set has 180 reviews by different customer type.
#Plot
barplot(prop.table(table(df3$bot)), main = "Reviews by Customer Type", xlab="Customer Type", ylab = "Review Count")

#df3$rating[df3$score >= 0.6] <- "Positive"
#df3$rating[df3$score < 0.6] <- "Negative"
#df3$rating <- as.factor(df3$rating)

#write.csv(df1, file = "camera.csv", row.names = FALSE)

Train and Test Data Set

#Analyze 

#Prepare train and test
## 75% of the sample size
smp_size <- floor(0.75 * nrow(df3))

## set the seed to make your partition reproductible
set.seed(123)
train_ind <- sample(seq_len(nrow(df3)), size = smp_size)

train <- df3[train_ind, ]
test <- df3[-train_ind, ]

nrow(train)
## [1] 135
summary(train)
##    comments                   bot          rating   
##  Length:135                     :63   Negative:  8  
##  Class :character   Beginner    : 2   Positive:127  
##  Mode  :character   Hobbyist    :45                 
##                     Professional: 8                 
##                     Semi-Pro    :17
nrow(test)
## [1] 45
summary(test)
##    comments                   bot          rating  
##  Length:45                      :21   Negative: 4  
##  Class :character   Beginner    : 0   Positive:41  
##  Mode  :character   Hobbyist    :16                
##                     Professional: 3                
##                     Semi-Pro    : 5
barplot(prop.table(table(train$rating)),main = "Review Rating in Training Data", ylab = "Review Count")

barplot(prop.table(table(test$rating)),main = "Review Rating in Test Data ", ylab = "Review Count")

train_corpus <- Corpus(VectorSource(train$comments))
length(train_corpus)
## [1] 135
train_corpus <- tm_map(train_corpus, content_transformer(tolower))
train_corpus <- tm_map(train_corpus, removeWords, stopwords("english") )
train_corpus <- tm_map(train_corpus, removePunctuation)
train_corpus <- tm_map(train_corpus, removeNumbers)
train_corpus <- tm_map(train_corpus, stripWhitespace)

train_corpus_st <- tm_map(train_corpus, stemDocument,language="english" )
#Document term matrix
train_dtm <- DocumentTermMatrix(train_corpus_st)
dim(train_dtm)
## [1]  135 1624
train_rmspa <- removeSparseTerms(train_dtm,0.95)
dim(train_rmspa)
## [1] 135 222
#show the avg frequency of top 20 frequent words 
mean_train = sort(colMeans(as.matrix(train_rmspa)), decreasing = T)
mean_train[1:20]
##    camera       use     nikon     great       get       one      will 
## 2.1333333 0.9259259 0.7481481 0.5333333 0.4888889 0.4888889 0.4592593 
##     shoot    pictur       can      good     focus      take      imag 
## 0.4296296 0.4148148 0.4074074 0.4074074 0.3851852 0.3703704 0.3629630 
##     photo    featur       set      like      need   qualiti 
## 0.3629630 0.3333333 0.3259259 0.3185185 0.3185185 0.3185185
avg_top20=mean(mean_train[1:20])
avg_top20
## [1] 0.5266667
barplot(mean_train[1:20], border=NA, las =3, main ='Top 20 words (by mean) of training data', ylab='Frequency', ylim=c(0,3))

tr_rmspa_m_no_zero=as.matrix(train_rmspa)
#convert zeros to NA.
is.na(tr_rmspa_m_no_zero) <- tr_rmspa_m_no_zero==0
# calculate mean without taking NA

mean_tr_m = sort(colMeans(as.matrix(tr_rmspa_m_no_zero ), na.rm = T), decreasing = T)
mean_tr_m[1:20]
##   camera   buffer   pictur    focus     card  problem     bodi   sensor 
## 2.691589 2.250000 2.000000 1.925926 1.923077 1.900000 1.894737 1.888889 
##    photo      use      can    nikon    shoot     take      iso      one 
## 1.814815 1.811594 1.774194 1.771930 1.757576 1.724138 1.714286 1.692308 
##      set     imag     will     issu 
## 1.692308 1.689655 1.675676 1.636364
#average frequency of these top 20 words

avg_top20=mean(mean_tr_m[1:20])
avg_top20
## [1] 1.861453
barplot(mean_tr_m[1:20], border=NA, las =3, main='Top 20 words(after removing NA from Matrix)', ylab='Frequency', ylim=c(0,3))

Sentiment classification:

 Sentiment classification is done based on the frequence of the terms.
#############################################################
#Sentiment classification
train_bow_freq <- as.matrix(train_rmspa)
#combine the rating with term frequencies of the BOW in training data frame.
train_data_m=data.frame(y=as.factor(train$rating), x= train_bow_freq)
#summary(train_data_m)
#head(str(train_data_m),5)

train_bow_m = findFreqTerms(train_rmspa)
length(train_bow_m)
## [1] 222
##Test data
test_corpus <- Corpus(VectorSource(test$comments))
#generate test DTM based on training list of words.
bow_test_m <- DocumentTermMatrix(test_corpus, control = list(tolower = T,stopwords = T,  removePunctuation = T,removeNumbers = T, stripWhitespace = T, stemming = T, dictionary = train_bow_m) )

#head(str(bow_test_m),5)
dim (bow_test_m)
## [1]  45 221
# transform to matrix.
test_bowfreq_m <- as.matrix(bow_test_m)
# combine the sentiment rating with term freq of the bow in test data frame.
test_data_m <- data.frame(y=as.factor(test$rating), x= test_bowfreq_m)
#head(str(test_data_m),5)

NB Modal for Term Frequency

library(e1071)

#Build Naive Bayes  Modal on training data set.
bow_nb_m <- naiveBayes(y ~., data = train_data_m)
summary(bow_nb_m)
##         Length Class  Mode     
## apriori   2    table  numeric  
## tables  222    -none- list     
## levels    2    -none- character
## call      4    -none- call
##generate prediction for testing data
testpred = predict(bow_nb_m, newdata=test_data_m)  

confusionMatrix(testpred, test_data_m[,1], positive="Positive",dnn=c("Prediction","True"))
## Confusion Matrix and Statistics
## 
##           True
## Prediction Negative Positive
##   Negative        4       41
##   Positive        0        0
##                                           
##                Accuracy : 0.0889          
##                  95% CI : (0.0248, 0.2122)
##     No Information Rate : 0.9111          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0               
##  Mcnemar's Test P-Value : 4.185e-10       
##                                           
##             Sensitivity : 0.00000         
##             Specificity : 1.00000         
##          Pos Pred Value :     NaN         
##          Neg Pred Value : 0.08889         
##              Prevalence : 0.91111         
##          Detection Rate : 0.00000         
##    Detection Prevalence : 0.00000         
##       Balanced Accuracy : 0.50000         
##                                           
##        'Positive' Class : Positive        
## 
mmetric(testpred,test_data_m[,1], c("ACC","TPR","PRECISION","F1"))
##        ACC       TPR1       TPR2 PRECISION1 PRECISION2        F11 
##   8.888889   8.888889   0.000000 100.000000   0.000000  16.326531 
##        F12 
##   0.000000

Binary Weights:

Sentiment classification is done based on Binary Weights rather than term frequency to compare between the two.
#Binary Weights

#Document term matrix with binary weights

train_bin_dtm <- DocumentTermMatrix(train_corpus_st, control=list(weighting=weightBin))
dim(train_bin_dtm)
## [1]  135 1624
#head(str(train_bin_dtm),5)
train_rmspa_bin_dtm = removeSparseTerms(train_bin_dtm,0.95)
dim(train_rmspa_bin_dtm)
## [1] 135 222
#bag of words
train_bow_95 = findFreqTerms(train_rmspa_bin_dtm)
#transfor to matrix
train_bow_bin <- as.matrix(train_rmspa_bin_dtm)

#show the avg frequency of top 20 frequent words 
mean_train_bin = sort(colMeans(as.matrix(train_bow_bin)), decreasing = T)
mean_train_bin[1:20]
##    camera       use     nikon     great       get       one      good 
## 0.7925926 0.5111111 0.4222222 0.3407407 0.3185185 0.2888889 0.2740741 
##      will      like     shoot   qualiti       can      just      imag 
## 0.2740741 0.2518519 0.2444444 0.2370370 0.2296296 0.2222222 0.2148148 
##      need      take      time    featur    pictur     photo 
## 0.2148148 0.2148148 0.2148148 0.2074074 0.2074074 0.2000000
avg_top20_bin=mean(mean_train_bin[1:20])
avg_top20_bin
## [1] 0.2940741
barplot(mean_train_bin[1:20], border=NA, las =3, main ='Top 20 words(Binary Weights)', ylab='Frequency', ylim=c(0,1))

#combine the rating with term frequencies of the BOW in training data frame.
train_data_bin_m=data.frame(y=as.factor(train$rating), x= train_bow_bin)
#summary(train_data_m)
#head(str(train_data_bin_m),5)
dim(train_data_bin_m)
## [1] 135 223
##Test data
#test_corpus 
#generate test DTM based on training list of words.
test_dtm_bin <- DocumentTermMatrix(test_corpus, control = list(tolower = T,stopwords = T,  removePunctuation = T,removeNumbers = T, stripWhitespace = T, stemming = T, dictionary = train_bow_95, list(weighting= weightBin)  ))

#head(str(test_dtm_bin),5)
dim (test_dtm_bin)
## [1]  45 221
# transform to matrix.
test_dtm_bin_m <- as.matrix(test_dtm_bin)
# combine the sentiment rating with term freq of the bow in test data frame.
test_data_dtm_bin_m <- data.frame(y=as.factor(test$rating), x= test_dtm_bin_m)
#head(str(test_data_dtm_bin_m),5)
dim(test_data_dtm_bin_m)
## [1]  45 222
library(e1071)
#Build Naive Bayes  Modal on training data set.
bow_nb_bin_m <- naiveBayes(y ~., data = train_data_bin_m)
summary(bow_nb_bin_m)
##         Length Class  Mode     
## apriori   2    table  numeric  
## tables  222    -none- list     
## levels    2    -none- character
## call      4    -none- call
##generate prediction for testing data
testpred = predict(bow_nb_bin_m, newdata=test_data_dtm_bin_m)  

confusionMatrix(testpred, test_data_dtm_bin_m[,1], positive="Positive",dnn=c("Prediction","True"))
## Confusion Matrix and Statistics
## 
##           True
## Prediction Negative Positive
##   Negative        4       41
##   Positive        0        0
##                                           
##                Accuracy : 0.0889          
##                  95% CI : (0.0248, 0.2122)
##     No Information Rate : 0.9111          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0               
##  Mcnemar's Test P-Value : 4.185e-10       
##                                           
##             Sensitivity : 0.00000         
##             Specificity : 1.00000         
##          Pos Pred Value :     NaN         
##          Neg Pred Value : 0.08889         
##              Prevalence : 0.91111         
##          Detection Rate : 0.00000         
##    Detection Prevalence : 0.00000         
##       Balanced Accuracy : 0.50000         
##                                           
##        'Positive' Class : Positive        
## 
mmetric(testpred,test_data_dtm_bin_m[,1], c("ACC","TPR","PRECISION","F1"))
##        ACC       TPR1       TPR2 PRECISION1 PRECISION2        F11 
##   8.888889   8.888889   0.000000 100.000000   0.000000  16.326531 
##        F12 
##   0.000000

TfIdf:

Sentiment classification using Term Frequency Inverse Document Frequency- TfIdf. 
#TfIdf

#Document term matrix with binary weights

train_tfidf_dtm <- DocumentTermMatrix(train_corpus_st, control=list(weighting=weightTfIdf))
dim(train_tfidf_dtm)
## [1]  135 1624
#head(str(train_bin_dtm),5)
train_rmspa_tfidf_dtm = removeSparseTerms(train_tfidf_dtm,0.95)
dim(train_rmspa_tfidf_dtm)
## [1] 135 222
#bag of words
train_bow_95 = findFreqTerms(train_rmspa_tfidf_dtm)
#transfor to matrix
train_bow_tfidf <- as.matrix(train_rmspa_tfidf_dtm)

#show the avg frequency of top 20 frequent words 
mean_train_tfidf = sort(colMeans(as.matrix(train_bow_tfidf)), decreasing = T)
mean_train_tfidf[1:20]
##       like      excel    everyth      great      happi       love 
## 0.02357553 0.02356929 0.02245487 0.02194978 0.02147346 0.02026070 
##      nikon       good        use     camera      pleas     pictur 
## 0.02019247 0.02006135 0.01963605 0.01820228 0.01792447 0.01786050 
##       imag       easi       time    qualiti     upgrad     option 
## 0.01742874 0.01657303 0.01617229 0.01613746 0.01582568 0.01508249 
##       bodi       best 
## 0.01506548 0.01495082
avg_top20_tfidf=mean(mean_train_tfidf[1:20])
avg_top20_tfidf
## [1] 0.01871984
barplot(mean_train_tfidf[1:20], border=NA, las =3, main ='Top 20 words(tfidf weights) in Training data', ylab='Frequency', ylim=c(0,0.5))

#combine the rating with term frequencies of the BOW in training data frame.
train_data_tfidf_m=data.frame(y=as.factor(train$rating), x= train_bow_tfidf)
#summary(train_data_m)
#head(str(train_data_bin_m),5)
dim(train_data_tfidf_m)
## [1] 135 223
##Test data
#test_corpus 
#generate test DTM based on training list of words.
test_dtm_tfidf <- DocumentTermMatrix(test_corpus, control = list(tolower = T,stopwords = T,  removePunctuation = T,removeNumbers = T, stripWhitespace = T, stemming = T, dictionary = train_bow_95, list( weighting= weightTfIdf)  ))

#head(str(test_dtm_bin),5)
dim (test_dtm_tfidf)
## [1]  45 221
# transform to matrix.
test_dtm_tfidf_m <- as.matrix(test_dtm_tfidf)
# combine the sentiment rating with term freq of the bow in test data frame.
test_data_dtm_tfidf_m <- data.frame(y=as.factor(test$rating), x= test_dtm_tfidf_m)
#head(str(test_data_dtm_bin_m),5)
dim(test_data_dtm_tfidf_m)
## [1]  45 222
library(e1071)
#Build Naive Bayes  Modal on training data set.
bow_nb_tfidf_m <- naiveBayes(y ~., data = train_data_tfidf_m)
summary(bow_nb_tfidf_m)
##         Length Class  Mode     
## apriori   2    table  numeric  
## tables  222    -none- list     
## levels    2    -none- character
## call      4    -none- call
##generate prediction for testing data
testpred = predict(bow_nb_tfidf_m, newdata=test_data_dtm_tfidf_m)  

confusionMatrix(testpred, test_data_dtm_tfidf_m[,1], positive="Positive", dnn=c("Prediction","True"))
## Confusion Matrix and Statistics
## 
##           True
## Prediction Negative Positive
##   Negative        3       34
##   Positive        1        7
##                                          
##                Accuracy : 0.2222         
##                  95% CI : (0.112, 0.3709)
##     No Information Rate : 0.9111         
##     P-Value [Acc > NIR] : 1              
##                                          
##                   Kappa : -0.0168        
##  Mcnemar's Test P-Value : 6.338e-08      
##                                          
##             Sensitivity : 0.17073        
##             Specificity : 0.75000        
##          Pos Pred Value : 0.87500        
##          Neg Pred Value : 0.08108        
##              Prevalence : 0.91111        
##          Detection Rate : 0.15556        
##    Detection Prevalence : 0.17778        
##       Balanced Accuracy : 0.46037        
##                                          
##        'Positive' Class : Positive       
## 
mmetric(testpred,test_data_dtm_tfidf_m[,1], c("ACC","TPR","PRECISION","F1"))
##        ACC       TPR1       TPR2 PRECISION1 PRECISION2        F11 
##  22.222222   8.108108  87.500000  75.000000  17.073171  14.634146 
##        F12 
##  28.571429

N-gram Analysis:

dtm.docs <- train_dtm
docs.s <- train_corpus_st

sums <- colapply_simple_triplet_matrix(dtm.docs,FUN=sum)
sums <- sort(sums, decreasing=T)
# Functions
BigramTokenizer <- function(x) {RWeka::NGramTokenizer(x, RWeka::Weka_control(min=2, max=2))}
ThreegramTokenizer <- function(x) {RWeka::NGramTokenizer(x, RWeka::Weka_control(min=3, max=3))}
FourgramTokenizer <- function(x) {RWeka::NGramTokenizer(x, RWeka::Weka_control(min=4, max=4))}

# Bigrams
options(mc.cores=1)
dtm.docs.2g <- DocumentTermMatrix(docs.s, control=list(tokenize=BigramTokenizer))

#Threegrams
options(mc.cores=1)
dtm.docs.3g <- DocumentTermMatrix(docs.s, control=list(tokenize=ThreegramTokenizer))

#Fourgrams
options(mc.cores=1)
dtm.docs.4g <- DocumentTermMatrix(docs.s, control=list(tokenize=FourgramTokenizer))
# freqTerms.4g.docs <- findFreqTerms(dtm.docs.4g,20,Inf)

# To get the bigram dist
sums.2g <- colapply_simple_triplet_matrix(dtm.docs.2g,FUN=sum)
sums.2g <- sort(sums.2g, decreasing=T)

# To get the threegram dist
sums.3g <- colapply_simple_triplet_matrix(dtm.docs.3g,FUN=sum)
sums.3g <- sort(sums.3g, decreasing=T)

# To get the fourgram dist
sums.4g <- colapply_simple_triplet_matrix(dtm.docs.4g,FUN=sum)
sums.4g <- sort(sums.4g, decreasing=T)

barplot(sums.2g[1:20], border=NA, las =3, main ='Bigram Distribution of Top 20 frequent words', ylab='Count', ylim=c(0,300))

barplot(sums.3g[1:20], border=NA, las =3, main ='Three gram Distribution of Top 20 frequent words', ylab='Count', ylim=c(0,300))

barplot(sums.4g[1:20], border=NA, las =3, main ='Four gram Distribution of Top 20 frequent words', ylab='Count', ylim=c(0,300))

K Mean:

kMean plot using the Top 20 frequent words from the 4 gram dist of training data.

m <- as.matrix(sums.4g[1:20])
d <-dist(m)

#k means algorithm, 3 clusters, 
kfit <- kmeans(d, 3, nstart=5)
#plot
clusplot(m, kfit$cluster, color=T, shade=T, labels=2, lines=0)

Conclusion:

Based on the comparison of modals using term frequency, binary weights and tfidf, the observation is, Accuracy, F1 measure and all metrics has improved a lot with tfidf. Also rear terms have been promoted using tfidf as it measures how important a term is. Lot of positive words reported by IDF that did not appear with term frequency or binary weights. I believe the metrics will be even higher with a larger data set.

References