Sentiment Analysis for Detecting Unethical Sales Practices

Lekshman Ramesh

March 5, 2017

Why?

Mining employee feedback data to look for unethical practices
Feedback could be positive or negative
Exclude positive feedback to prepare data for further analysis

How?

Publicly Available Amazon Reviews data
Process the data to account for negation (e.g. rewrite the contraction "n’t" as " not")
Tokenize the data and turn it into a sparse matrix
Run a Machine Learning algorithm to train the model

Data Reading

library(rjson)
library(plyr)
library(Matrix)
library(xgboost)
library(stringr)
# Location of the Amazon cell-phone reviews file. Each line of the file is
# parsed below as one JSON record.
tr.review <- "D:/Lekshman/Kaggle/Text Analytics/reviews_Cell_Phones_and_Accessories_5.json/Cell_Phones_and_Accessories_5.json"
# Passing the path straight to readLines() lets it open and close the file
# itself, so no connection is left open if reading fails part-way.
input <- readLines(tr.review, -1L)
# Parse every line, flatten the record into a one-row matrix and stack the
# rows into a data frame. Flattening expands the two-element "helpful" field
# into the columns helpful1 and helpful2.
cell <- ldply(lapply(input, function(x) t(unlist(fromJSON(x)))))
# Print the first record as a sanity check of the parsed layout.
cell[1, ]

##       reviewerID       asin reviewerName helpful1 helpful2
## 1 A30TL5EWN6DFXT 120401325X    christina        0        0
##                                                                                                                                                                                      reviewText
## 1 They look good and stick good! I just don't like the rounded shape because I was always bumping it and Siri kept popping up and it was irritating. I just won't buy a product like this again
##   overall    summary unixReviewTime  reviewTime
## 1       4 Looks Good     1400630400 05 21, 2014

# Keeping only required variables.
# Select by name rather than by position (originally columns 5:7) so the
# subset stays correct if the column order of `cell` changes. Only reviewText
# and overall are used by the later steps; helpful2 is retained so the shape
# of cell_red is unchanged.
cell_red <- cell[, c("helpful2", "reviewText", "overall")]

Data Processing

# Split reviews into negative (1-2 stars) and positive (4-5 stars);
# 3-star reviews are dropped.
data1 <- cell_red[cell_red$overall %in% c(1, 2), ]
data2 <- cell_red[cell_red$overall %in% c(4, 5), ]
# Downsample the positive reviews to the size of the negative set so the two
# classes are balanced. Sizes are derived from the data instead of being
# hard-coded (the original constants 148657 / 24343 presumably were the row
# counts of data2 / data1 for this particular file).
n_positive <- min(nrow(data1), nrow(data2))
data2 <- data2[sample(nrow(data2), n_positive), ]
cell_red <- rbind(data1, data2)
# Work on a random subset of at most 10000 reviews.
cell_red <- cell_red[sample(nrow(cell_red), min(10000, nrow(cell_red))), ]
## Pre-processing data
# Expand the contraction "n't" to " not" first, then tag every standalone
# "not" as "not___". Doing it in this order gives "don't" and "do not" the
# same token, and the word-boundary match leaves words that merely contain
# "not" (e.g. "nothing", "another") untouched.
cell_red$rev1 <- gsub("n't", " not", cell_red$reviewText, fixed = TRUE)
cell_red$rev1 <- gsub("\\bnot\\b", "not___", cell_red$rev1, perl = TRUE)
# Replace all punctuation except underscore and hyphen with a space, then
# blank out digits as well.
cell_red$rev1 <- gsub("[^[:alnum:][:blank:]_\\-]", " ", cell_red$rev1)
cell_red$rev1 <- gsub("[[:digit:]]", " ", cell_red$rev1)
# Trim the ends and collapse runs of whitespace into a single space.
cell_red$rev1 <- gsub("\\s+", " ", str_trim(cell_red$rev1))
# Excluding reviews whose original text is blank.
cell_red <- cell_red[cell_red$reviewText != "", ]

Sparse Matrix Creation using tm library

library(tm)

## Loading required package: NLP

# Wrap the cleaned review text (rev1) in a tm corpus.
data2 <- Corpus(VectorSource(cell_red$rev1))
# Convert the documents to PlainTextDocument before building the
# document-term matrix.
data2 <- tm_map(data2, PlainTextDocument)
# Strip the words returned by stopwords() from every document.
data2 <- tm_map(data2, removeWords, stopwords())
data2_tdm <- DocumentTermMatrix(data2)
# Drop infrequent terms: keep only those found by findFreqTerms() with a
# lower frequency bound of 5.
vars <- findFreqTerms(data2_tdm, 5)
data2_tdm_red <- data2_tdm[, intersect(colnames(data2_tdm), vars)]
# Rebuild the reduced matrix as a Matrix::sparseMatrix from its row index,
# column index and value vectors.
call_mat <- sparseMatrix(
  i = data2_tdm_red$i,
  j = data2_tdm_red$j,
  x = data2_tdm_red$v,
  dims = c(data2_tdm_red$nrow, data2_tdm_red$ncol)
)
# Dimensions of the matrix (documents x terms).
dim(call_mat)

## [1] 9995 5526

Machine Learning Model

# Build the xgboost training matrix. The label is 1 for positive reviews
# (4-5 stars) and 0 otherwise.
xgmat_red <- xgb.DMatrix(
  call_mat,
  label = ifelse(cell_red$overall %in% c(4, 5), 1, 0)
)
# Booster settings for a binary classifier built from gradient-boosted trees.
param <- list(
  objective = "binary:logistic",
  booster = "gbtree",
  eta = 0.01,
  subsample = 0.7,
  colsample_bytree = 0.7,
  max_depth = 50
)
# 4-fold cross-validation over 700 boosting rounds, reporting every 100th
# round. The argument is spelled out as `nrounds` instead of relying on
# partial matching of `nround`.
cv.res_red <- xgb.cv(
  data = xgmat_red,
  nfold = 4,
  nrounds = 700,
  params = param,
  print_every_n = 100
)

## [1]  train-error:0.196433+0.006491   test-error:0.310352+0.010736 
## [101]    train-error:0.055061+0.001924   test-error:0.202501+0.002147 
## [201]    train-error:0.035284+0.000759   test-error:0.189695+0.005499 
## [301]    train-error:0.023812+0.001648   test-error:0.178789+0.004877 
## [401]    train-error:0.017575+0.001035   test-error:0.171086+0.003875 
## [501]    train-error:0.012840+0.000683   test-error:0.164482+0.004009 
## [601]    train-error:0.009705+0.000529   test-error:0.161880+0.003067 
## [700]    train-error:0.007904+0.000739   test-error:0.158379+0.003624

Clearly, the model, though simple, does a good job of classifying customer feedback: the cross-validated test error after 700 rounds is about 0.158, i.e. roughly 84% accuracy.

Parameter Optimization

Tuned xgboost using the following code

# Grid of xgboost settings to evaluate: 3 tree depths x 3 row-subsample rates
# x 3 column-subsample rates (27 combinations), all with 200 rounds and
# eta = 0.05.
xgb_grid_1 <- expand.grid(
  nrounds = 200,
  eta = 0.05,
  max_depth = c(50, 75, 100),
  subsample = c(0.5, 0.7, 0.9),
  colsample_bytree = c(0.5, 0.7, 0.9)
)
# Parameter tuning: run 4-fold CV for every grid row and keep the metrics of
# the final boosting round. Results are collected in a preallocated list
# instead of growing a table with rbind() inside the loop.
cv_rows <- vector("list", nrow(xgb_grid_1))
for (i in seq_len(nrow(xgb_grid_1))) {
  param <- list(
    objective = "binary:logistic",
    booster = "gbtree",
    eta = xgb_grid_1$eta[i],
    subsample = xgb_grid_1$subsample[i],
    colsample_bytree = xgb_grid_1$colsample_bytree[i],
    max_depth = xgb_grid_1$max_depth[i]
  )
  # The number of rounds comes from the grid rather than a second hard-coded
  # 200, so changing the grid changes the run.
  cv.res_red <- xgb.cv(
    data = xgmat_red,
    nfold = 4,
    nrounds = xgb_grid_1$nrounds[i],
    params = param,
    verbose = 0
  )
  cv_op <- cv.res_red$evaluation_log
  # Keep only the last logged round.
  cv_op <- cv_op[nrow(cv_op), ]
  cv_rows[[i]] <- cv_op
}
# Stack the per-combination rows. The list is reversed so cv_final keeps the
# ordering of the original code, which prepended each new result: row k of
# cv_final belongs to grid row nrow(xgb_grid_1) - k + 1. An empty grid yields
# NULL, as before.
cv_final <- do.call(rbind, rev(cv_rows))