Data Reading

library(rjson)
library(plyr)
library(Matrix)
library(xgboost)
library(stringr)
# Read a file holding one JSON record per line into a data frame with one row
# per record.
#
# Each line is parsed with rjson::fromJSON(), flattened with unlist() and
# turned into a one-row matrix; plyr::ldply() then stacks the rows.
#
# path: path of the line-delimited JSON file.
# Returns: a data frame with one row per input line.
read_json_lines <- function(path) {
  con <- file(path, "r")
  # Close the connection even if reading fails part-way through.
  on.exit(close(con), add = TRUE)
  records <- readLines(con, -1L)
  ldply(lapply(records, function(x) t(unlist(fromJSON(x)))))
}

cell_path <- "D:/Lekshman/Kaggle/Text Analytics/reviews_Cell_Phones_and_Accessories_5.json/Cell_Phones_and_Accessories_5.json"
lawn_path <- "D:/Lekshman/Kaggle/Text Analytics/reviews_Cell_Phones_and_Accessories_5.json/Patio_Lawn_and_Garden_5.json"

cell <- read_json_lines(cell_path)
lawn <- read_json_lines(lawn_path)

# Number of reviews drawn from each product category.
n_sample <- 2000

# Sample from the rows actually read rather than from a hard-coded row count,
# so every review is eligible and no out-of-range row can be picked.
# Columns 5:7 are selected by position; later steps use `reviewText`, so it is
# presumably one of them -- verify against the parsed column order.
# NOTE(review): no seed is set in the visible code, so the sample (and all
# downstream results) will differ between runs.
cell_red <- cell[sample(nrow(cell), n_sample), 5:7]
lawn_red <- lawn[sample(nrow(lawn), n_sample), 5:7]

Data Processing

# Record how many rows each category contributes before stacking them, so the
# class labels below do not depend on a hard-coded sample size.
n_cell <- nrow(cell_red)
n_lawn <- nrow(lawn_red)
cell_red <- rbind(cell_red, lawn_red)

# Replace all punctuation (the [[:punct:]] class includes the underscore) and
# all digits with a space, then trim and collapse runs of whitespace.
cell_red$rev1 <- gsub("[[:punct:]]", " ", cell_red$reviewText)
cell_red$rev1 <- gsub("[[:digit:]]", " ", cell_red$rev1)
cell_red$rev1 <- gsub("\\s+", " ", str_trim(cell_red$rev1))

# Assign the actual class: rows from the first data set come first in the
# stacked frame, followed by the rows from the second.
cell_red$class <- rep(
  c("Cell Phones", "Gardening Equipment"),
  times = c(n_cell, n_lawn)
)

# Remove blanks. The test is on the cleaned text so that reviews consisting
# only of punctuation or digits are dropped too (a review whose raw text is
# empty is also empty after cleaning); the NA guard keeps rows with missing
# text from turning into all-NA rows.
cell_red <- cell_red[!is.na(cell_red$rev1) & cell_red$rev1 != "", ]

Sparse Matrix Creation using tm library

library(tm)

## Loading required package: NLP

library(SnowballC)

# Build a corpus with one document per cleaned review.
data2 <- Corpus(VectorSource(cell_red$rev1))
# Adjusting the format to PlainTextDocument. Essential for creation of the
# document-term matrix.
# NOTE(review): whether this step is still needed depends on the installed tm
# version -- confirm.
data2 <- tm_map(data2, PlainTextDocument)
# Convert to lower case BEFORE removing stop words: removeWords() matches
# case-sensitively and the stop-word list is lower-case, so capitalised stop
# words ("The", "This", ...) would otherwise survive the removal.
data2 <- tm_map(data2, content_transformer(tolower))
# Remove stop words like a, the, for, etc. that appear often but provide no
# information.
data2 <- tm_map(data2, removeWords, stopwords())
# Convert each word to its root form.
data2 <- tm_map(data2, stemDocument)

# Documents in rows, terms in columns.
data2_tdm <- DocumentTermMatrix(data2)
# Keep only the terms that occur at least 5 times across the corpus.
vars <- findFreqTerms(data2_tdm, 5)
data2_tdm_red <- data2_tdm[, intersect(colnames(data2_tdm), vars)]
dim(data2_tdm_red)

## [1] 3997 3684

Unsupervised Learning Model

library(topicmodels)

# LDA parameters for Gibbs sampling
burnin <- 4000 # Iterations discarded at the start of each run
iter <- 2000 # Iterations after burn-in
thin <- 500 # Keep iterations that are far apart to ensure independence
seed <- list(2003, 5, 63, 100001, 765) # One seed per run, for reproducibility
nstart <- 5 # Number of reruns
best <- TRUE # Keep only the best of the nstart runs
k <- 2 # Number of topics: one per product category, so no guessing needed

# The rows of the document-term matrix must line up with the rows of cell_red
# for the comparison against the actual classes to be meaningful.
stopifnot(nrow(data2_tdm_red) == nrow(cell_red))

# LDA() stops with an error if any document has no remaining terms, which can
# happen once stop words and rare terms have been removed. Drop such documents
# and keep the class labels aligned with the rows that are modelled.
# (as.matrix() densifies the matrix; acceptable at this sample size.)
non_empty <- rowSums(as.matrix(data2_tdm_red)) > 0
lda_input <- data2_tdm_red[non_empty, ]

# Running the model
ldaOut <- LDA(
  lda_input,
  k,
  method = "Gibbs",
  control = list(
    nstart = nstart,
    seed = seed,
    best = best,
    burnin = burnin,
    iter = iter,
    thin = thin
  )
)

# Comparing predictions to actual classes
ldaOut.topics <- as.matrix(topics(ldaOut))
table(cell_red$class[non_empty], ldaOut.topics)

##                      ldaOut.topics
##                          1    2
##   Cell Phones         1830  168
##   Gardening Equipment  576 1423

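LDA does a good job of identifying the two themes and hence of classifying the customer feedback: in the table above, the assigned topic agrees with the actual product category for 3,253 of the 3,997 reviews (about 81%).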

Unsupervised Classification

Why?

Data Reading

Data Processing

Sparse Matrix Creation using tm library

Unsupervised Learning Model