Sentiment Analysis for Detecting Unethical Sales Practices

Lekshman Ramesh

March 5, 2017

Why?

How?

Read the Amazon cell-phone review data, clean the review text, build a sparse document-term matrix, and train an XGBoost classifier to separate negative (1-2 star) from positive (4-5 star) reviews.

Data Reading

library(rjson)
library(plyr)
library(Matrix)
library(xgboost)
library(stringr)
# Path to the Amazon "Cell Phones and Accessories" 5-core review file
tr.review <- "D:/Lekshman/Kaggle/Text Analytics/reviews_Cell_Phones_and_Accessories_5.json/Cell_Phones_and_Accessories_5.json"
# The file is line-delimited JSON, one review per line
con <- file(tr.review, "r")
input <- readLines(con, -1L)
close(con)
# Parse each line, flatten it to a one-row matrix and stack into a data frame
cell <- ldply(lapply(input, function(x) t(unlist(fromJSON(x)))))
cell[1,]
##       reviewerID       asin reviewerName helpful1 helpful2
## 1 A30TL5EWN6DFXT 120401325X    christina        0        0
##                                                                                                                                                                                      reviewText
## 1 They look good and stick good! I just don't like the rounded shape because I was always bumping it and Siri kept popping up and it was irritating. I just won't buy a product like this again
##   overall    summary unixReviewTime  reviewTime
## 1       4 Looks Good     1400630400 05 21, 2014
# Keeping only the required variables: columns 5:7 (helpful2, reviewText, overall)
cell_red = cell[, 5:7]
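
The same line-delimited JSON can also be streamed in one call with the jsonlite package (an alternative sketch, not the pipeline used here; note that the helpful field then stays a nested list column instead of being flattened into helpful1/helpful2):

library(jsonlite)
# Hypothetical alternative read; the code above uses the rjson/plyr version
cell_alt <- stream_in(file(tr.review))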

Data Processing

# Keep only clearly negative (1-2 star) and clearly positive (4-5 star) reviews
data1 = cell_red[cell_red$overall %in% c(1,2),]
data2 = cell_red[cell_red$overall %in% c(4,5),]
# Downsample the 148,657 positive reviews to match the 24,343 negative ones
data2 = data2[sample(148657, 24343),]
cell_red = rbind(data1, data2)
# Work with a random subset of 10,000 of the 48,686 balanced reviews
cell_red = cell_red[sample(48686, 10000),]
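
A quick check that the two classes are now balanced (counts will vary with the random draw):

# Tabulate star ratings in the sampled data; 1-2 star counts should roughly
# match 4-5 star counts
table(cell_red$overall)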
## Pre-processing the data
# Mark negations: expand "n't" to " not", then tag whole-word "not" with
# underscores so the stopword filter later does not strip it
cell_red$rev1 = gsub("n't", " not", cell_red$reviewText)
cell_red$rev1 = gsub("\\bnot\\b", "not___", cell_red$rev1)
# Remove all punctuation except underscores and hyphens, then strip digits
cell_red$rev1 = gsub("[^[:alnum:][:blank:]_\\-]", " ", cell_red$rev1)
cell_red$rev1 = gsub("[[:digit:]]", " ", cell_red$rev1)
# Collapse repeated whitespace and trim leading/trailing spaces
cell_red$rev1 = gsub("\\s+", " ", str_trim(cell_red$rev1))
# Excluding reviews with empty text
cell_red = cell_red[cell_red$reviewText != "",]
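
The effect of the cleaning steps is easiest to see on a made-up review run through the same five calls (a hypothetical example, not a row from the data):

# Walk one invented review through the cleaning pipeline above
x <- "I don't like it; Siri kept popping up 24/7!"
x <- gsub("n't", " not", x)
x <- gsub("\\bnot\\b", "not___", x)
x <- gsub("[^[:alnum:][:blank:]_\\-]", " ", x)
x <- gsub("[[:digit:]]", " ", x)
gsub("\\s+", " ", str_trim(x))
## [1] "I do not___ like it Siri kept popping up"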

Sparse Matrix Creation using tm library

library(tm)
## Loading required package: NLP
data2 = Corpus(VectorSource(cell_red$rev1))
# Adjusting the format to PlainTextDocument, essential for creating a term-document matrix
data2 = tm_map(data2, PlainTextDocument)
# Removing stopwords
data2 = tm_map(data2, removeWords, stopwords())
data2_tdm = DocumentTermMatrix(data2)
# Removing infrequent words: keep only terms that occur at least 5 times
vars = findFreqTerms(data2_tdm, 5)
data2_tdm_red = data2_tdm[, intersect(colnames(data2_tdm), vars)]
# Convert the tm triplet matrix to a Matrix-package sparse matrix for xgboost
call_mat <- sparseMatrix(i = data2_tdm_red$i, j = data2_tdm_red$j, x = data2_tdm_red$v,
                         dims = c(data2_tdm_red$nrow, data2_tdm_red$ncol))
# Dimensions of the matrix: documents x terms
dim(call_mat)
## [1] 9995 5526
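
Before modelling, it helps to attach the vocabulary to the sparse matrix and glance at the most frequent terms (a quick sanity check, not part of the original pipeline):

# Carry the term names over from the reduced term matrix, then rank tokens
# by total frequency across all documents
colnames(call_mat) <- colnames(data2_tdm_red)
head(sort(colSums(call_mat), decreasing = TRUE), 10)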

Machine Learning Model

# Label is 1 for positive (4-5 star) reviews and 0 for negative (1-2 star)
xgmat_red = xgb.DMatrix(call_mat, label = ifelse(cell_red$overall %in% c(4,5), 1, 0))
param <- list(objective = "binary:logistic",
              booster = "gbtree",
              eta = 0.01,
              subsample = 0.7,
              colsample_bytree = 0.7,
              max_depth = 50)
# 4-fold cross-validation over 700 boosting rounds
cv.res_red = xgb.cv(data = xgmat_red, nfold = 4, nrounds = 700, params = param, print_every_n = 100)
## [1]  train-error:0.196433+0.006491   test-error:0.310352+0.010736 
## [101]    train-error:0.055061+0.001924   test-error:0.202501+0.002147 
## [201]    train-error:0.035284+0.000759   test-error:0.189695+0.005499 
## [301]    train-error:0.023812+0.001648   test-error:0.178789+0.004877 
## [401]    train-error:0.017575+0.001035   test-error:0.171086+0.003875 
## [501]    train-error:0.012840+0.000683   test-error:0.164482+0.004009 
## [601]    train-error:0.009705+0.000529   test-error:0.161880+0.003067 
## [700]    train-error:0.007904+0.000739   test-error:0.158379+0.003624
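
To see which tokens drive the classifier, one can fit a single model with the same parameters and inspect its feature importance (a sketch, not part of the original analysis):

# Fit one booster on the full training matrix with the CV settings above
bst <- xgb.train(params = param, data = xgmat_red, nrounds = 700)
# Rank terms by gain; negation-tagged tokens such as "not___" may surface here
imp <- xgb.importance(feature_names = colnames(data2_tdm_red), model = bst)
head(imp, 20)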

Parameter Optimization

# Grid of candidate hyper-parameters
xgb_grid_1 = expand.grid(nrounds = 200, eta = 0.05, max_depth = c(50, 75, 100),
                         subsample = c(0.5, 0.7, 0.9), colsample_bytree = c(0.5, 0.7, 0.9))
# Parameter tuning: cross-validate each combination and record the final-round error
cv_final = NULL
for (i in 1:nrow(xgb_grid_1)) {
  param <- list(objective = "binary:logistic",
                booster = "gbtree",
                eta = xgb_grid_1$eta[i],
                subsample = xgb_grid_1$subsample[i],
                colsample_bytree = xgb_grid_1$colsample_bytree[i],
                max_depth = xgb_grid_1$max_depth[i])
  cv.res_red = xgb.cv(data = xgmat_red, nfold = 4, nrounds = xgb_grid_1$nrounds[i],
                      params = param, verbose = 0)
  # Keep the last-round CV error alongside the parameters that produced it
  cv_op = cv.res_red$evaluation_log[xgb_grid_1$nrounds[i], ]
  cv_final = rbind(cv_final, cbind(xgb_grid_1[i, ], cv_op))
}
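
With the grid parameters bound to each result as above, the winning combination is simply the row with the lowest cross-validated test error (column names follow xgboost's evaluation log):

# Pick the parameter combination with the lowest mean test error
cv_final[which.min(cv_final$test_error_mean), ]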